In [1]:
!pip install idx2numpy
import os
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split

from google.colab import drive
drive.mount('/content/drive')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Collecting idx2numpy
  Downloading idx2numpy-1.2.3.tar.gz (6.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: idx2numpy
  Building wheel for idx2numpy (setup.py) ... [?25l[?25hdone
  Created wheel for idx2numpy: filename=idx2numpy-1.2.3-py3-none-any.whl size=7903 sha256=17f9c061da1d57e81b8843b3c22d6b2b12b48384521cb87833a2c6c45929b03b
  Stored in directory: /root/.cache/pip/wheels/f7/48/00/ae031c97d62f39e1c3c4daa00426c09a65eb29ae5753a189ee
Successfully built idx2numpy
Installing collected packages: idx2numpy
Successfully installed idx2numpy-1.2.3
Mounted at /content/drive
Device: cuda


In [2]:
MNIST_DIR = "/content/drive/MyDrive/AIProject/numbers"
print("MNIST_DIR:", MNIST_DIR)
print("Exists?", os.path.exists(MNIST_DIR))
print("Contents:", os.listdir(MNIST_DIR) if os.path.exists(MNIST_DIR) else "NO DIR")


MNIST_DIR: /content/drive/MyDrive/AIProject/numbers
Exists? True
Contents: ['t10k-labels.idx1-ubyte', 't10k-images.idx3-ubyte', 'train-images.idx3-ubyte', 'train-labels.idx1-ubyte', 't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte', 'train-labels-idx1-ubyte', 'train-images-idx3-ubyte']


In [3]:
# compares predictions to true labels and returns the accuracy
def accuracy_from_logits(logits, targets):
    preds = logits.argmax(dim=1)
    correct = (preds == targets).sum().item()
    total = targets.size(0)
    return correct / total

# loops over each batch of images and labels from train_dl
def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    running_acc  = 0.0
    total_batches = 0

    for images, labels in loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc  += accuracy_from_logits(outputs, labels)
        total_batches += 1

    return running_loss / total_batches, running_acc / total_batches

# Loops over all batches in val_dl or test_dl, computes outputs and loss, and averages loss and accuracy
def eval_model(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0
    running_acc  = 0.0
    total_batches = 0

    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            running_acc  += accuracy_from_logits(outputs, labels)
            total_batches += 1

    return running_loss / total_batches, running_acc / total_batches


In [4]:
MNIST_DIR = "/content/drive/MyDrive/AIProject/numbers"

train_images_path = f"{MNIST_DIR}/train-images.idx3-ubyte"
train_labels_path = f"{MNIST_DIR}/train-labels.idx1-ubyte"
test_images_path  = f"{MNIST_DIR}/t10k-images.idx3-ubyte"
test_labels_path  = f"{MNIST_DIR}/t10k-labels.idx1-ubyte"

print(train_images_path)
print(test_images_path)


/content/drive/MyDrive/AIProject/numbers/train-images.idx3-ubyte
/content/drive/MyDrive/AIProject/numbers/t10k-images.idx3-ubyte


In [5]:
import idx2numpy
import numpy as np

X_mnist_train = idx2numpy.convert_from_file(train_images_path)
y_mnist_train = idx2numpy.convert_from_file(train_labels_path)
X_mnist_test  = idx2numpy.convert_from_file(test_images_path)
y_mnist_test  = idx2numpy.convert_from_file(test_labels_path)


In [6]:
AZ_DIR = "/content/drive/MyDrive/AIProject/letters kaggle"
AZ_CSV = f"{AZ_DIR}/A_Z Handwritten Data.csv"
az_df = pd.read_csv(AZ_CSV, header=None)
y_az = az_df.iloc[:, 0].values.astype(np.int64)      # 0–25 (A–Z) corresponds to letter position in alphabet
X_az = az_df.iloc[:, 1:].values.astype(np.float32)   # 784 pixels in float format

X_az /= 255.0   # scales pixel values from 0–255 to 0–1
X_az = X_az.reshape(-1, 1, 28, 28)   # turns each row of 784 numbers into a 28×28 grayscale image with 1 channel

# 0-9 for digits, 10-35 for letters
y_az_shifted = y_az + 10  # letters -> 10..35
print(X_az.shape, y_az_shifted.min(), y_az_shifted.max())

(372451, 1, 28, 28) 10 35


In [7]:
print("X_mnist_train:", X_mnist_train.shape)
print("X_mnist_test:",  X_mnist_test.shape)
print("X_az:",          X_az.shape)


X_mnist_train: (60000, 28, 28)
X_mnist_test: (10000, 28, 28)
X_az: (372451, 1, 28, 28)


In [8]:
# if they are (N, 28, 28), add channel dimension
if X_mnist_train.ndim == 3:
    X_mnist_train = X_mnist_train[:, None, :, :]
if X_mnist_test.ndim == 3:
    X_mnist_test  = X_mnist_test[:, None, :, :]

print("X_mnist_train:", X_mnist_train.shape)  # should be (N, 1, 28, 28)
print("X_mnist_test:",  X_mnist_test.shape)


X_mnist_train: (60000, 1, 28, 28)
X_mnist_test: (10000, 1, 28, 28)


In [9]:
# after reading A_Z CSV and normalizing:
X_az = X_az.reshape(-1, 1, 28, 28)
print("X_az:", X_az.shape)


X_az: (372451, 1, 28, 28)


In [10]:
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split

# combine MNIST train+test
X_digits = np.concatenate([X_mnist_train, X_mnist_test], axis=0)
y_digits = np.concatenate([y_mnist_train, y_mnist_test], axis=0)

X_all = np.concatenate([X_digits, X_az], axis=0)
y_all = np.concatenate([y_digits, y_az_shifted], axis=0)

X_all_tensor = torch.from_numpy(X_all)
y_all_tensor = torch.from_numpy(y_all)

dataset = TensorDataset(X_all_tensor, y_all_tensor)

total_len = len(dataset)
train_len = int(0.8 * total_len)
val_len   = int(0.1 * total_len)
test_len  = total_len - train_len - val_len

train_ds, val_ds, test_ds = random_split(
    dataset, [train_len, val_len, test_len],
    generator=torch.Generator().manual_seed(42)
)

BATCH_SIZE = 128
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_dl  = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)


In [11]:
class CharDigitCNN(nn.Module):
    def __init__(self, num_classes=36):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.pool  = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.fc1   = nn.Linear(128 * 7 * 7, 256)
        self.fc2   = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

model = CharDigitCNN(num_classes=36).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [12]:
EPOCHS = 15
for epoch in range(1, EPOCHS + 1):
    train_loss, train_acc = train_one_epoch(model, train_dl, optimizer, criterion, device)
    val_loss, val_acc     = eval_model(model, val_dl, criterion, device)
    print(
        f"Epoch {epoch:02d}: "
        f"train_loss={train_loss:.4f}, train_acc={train_acc*100:.2f}% | "
        f"val_loss={val_loss:.4f}, val_acc={val_acc*100:.2f}%"
    )


Epoch 01: train_loss=0.2912, train_acc=91.88% | val_loss=0.0932, val_acc=97.41%
Epoch 02: train_loss=0.0749, train_acc=97.93% | val_loss=0.0826, val_acc=97.70%
Epoch 03: train_loss=0.0564, train_acc=98.45% | val_loss=0.0576, val_acc=98.44%
Epoch 04: train_loss=0.0445, train_acc=98.75% | val_loss=0.0549, val_acc=98.45%
Epoch 05: train_loss=0.0372, train_acc=98.94% | val_loss=0.0504, val_acc=98.54%
Epoch 06: train_loss=0.0314, train_acc=99.11% | val_loss=0.0442, val_acc=98.87%
Epoch 07: train_loss=0.0267, train_acc=99.24% | val_loss=0.0480, val_acc=98.86%
Epoch 08: train_loss=0.0233, train_acc=99.33% | val_loss=0.0468, val_acc=99.00%
Epoch 09: train_loss=0.0208, train_acc=99.41% | val_loss=0.0404, val_acc=99.01%
Epoch 10: train_loss=0.0195, train_acc=99.46% | val_loss=0.0431, val_acc=98.97%
Epoch 11: train_loss=0.0164, train_acc=99.54% | val_loss=0.0466, val_acc=98.98%
Epoch 12: train_loss=0.0165, train_acc=99.56% | val_loss=0.0442, val_acc=99.06%
Epoch 13: train_loss=0.0148, train_acc=9

In [13]:
test_loss, test_acc = eval_model(model, test_dl, criterion, device)
print(f"Test loss={test_loss:.4f}, test acc={test_acc*100:.2f}%")


Test loss=0.0669, test acc=99.16%
