In [26]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset

class DocumentDataset(Dataset):
    def __init__(self, img_dir, labels_excel, transform=None):
        """
        img_dir:   path to folder containing images (e.g. 'data/classification/train')
        labels_excel: filename of the Excel sheet in that folder
        transform: torchvision transforms to apply
        """
        self.img_dir = img_dir
        self.transform = transform

        # 1) Read the Excel file into a DataFrame
        df = pd.read_csv(os.path.join(img_dir, labels_excel))

        # 2) Build a mapping from class name → index
        self.class_names = [' cheque', ' payslip', ' receipt', ' tax']
        self.class_to_idx = {c:i for i,c in enumerate(self.class_names)}

        # 3) Build a list of (image_path, label_index)
        self.samples = []
        for _, row in df.iterrows():
            fname = row['filename']
            full_path = os.path.join(img_dir, fname)
            if not os.path.isfile(full_path):
                continue  # skip missing files
            # find which column has a 1
            for cls in self.class_names:
                if row[cls] == 1:
                    label = self.class_to_idx[cls]
                    self.samples.append((full_path, label))
                    break
        print(f"Loaded {len(self.samples)} samples from {img_dir}")

        print("First 5 entries:", self.samples[:5])

    

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        img = Image.open(path).convert('RGB')
        if self.transform:
            img = self.transform(img)
        return img, label


In [36]:

from torch.utils.data import DataLoader

from torchvision import transforms

tform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),                           # should map [0,255]→[0.0,1.0]
    transforms.Lambda(lambda x: x / 255.0),          # <-- force it if ToTensor() is misbehaving
    transforms.Normalize(
        mean=[0.485,0.456,0.406],
        std=[0.229,0.224,0.225]
    ),
])


# instantiate datasets
train_ds = DocumentDataset("C:\\Users\\praag\\Desktop\\scrapiq\\doc scanner\\financial classification.v1i.multiclass\\train", '_classes.csv', transform=tform)
val_ds   = DocumentDataset("C:\\Users\\praag\\Desktop\\scrapiq\\doc scanner\\financial classification.v1i.multiclass\\valid", '_classes.csv', transform=tform)
test_ds  = DocumentDataset("C:\\Users\\praag\\Desktop\\scrapiq\\doc scanner\\financial classification.v1i.multiclass\\test", '_classes.csv', transform=tform)

from collections import Counter
train_counts = Counter(label for _, label in train_ds.samples)
print("✅ Train class counts:", train_counts)

# grab one raw PIL image and one transformed tensor
from PIL import Image
import numpy as np

# 1) load a raw PIL image
raw_path, _ = train_ds.samples[0]
raw_img = Image.open(raw_path).convert('RGB')
arr = np.array(raw_img)
print("RAW image pixel range:", arr.min(), arr.max())

# 2) run your transform pipeline manually
t = train_ds.transform
tensor_img = t(raw_img)
print("TRANSFORMED tensor shape:", tensor_img.shape)
print("TRANSFORMED pixel range:", tensor_img.min().item(), tensor_img.max().item())
print("Transform pipeline:", train_ds.transform)
raw_img = Image.open(train_ds.samples[0][0]).convert('RGB')
tensor_img = tform(raw_img)
print("Fixed TRANSFORMED range:", tensor_img.min().item(), tensor_img.max().item())


# loaders
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=32)
test_loader  = DataLoader(test_ds,  batch_size=32)


Loaded 302 samples from C:\Users\praag\Desktop\scrapiq\doc scanner\financial classification.v1i.multiclass\train
First 5 entries: [('C:\\Users\\praag\\Desktop\\scrapiq\\doc scanner\\financial classification.v1i.multiclass\\train\\51_jpg.rf.e0e7740d132b86db19ea3bfd18bb14d0.jpg', 0), ('C:\\Users\\praag\\Desktop\\scrapiq\\doc scanner\\financial classification.v1i.multiclass\\train\\receipt_00207_png_jpg.rf.e046c0fa8968ce4657787e6f1b758e13.jpg', 2), ('C:\\Users\\praag\\Desktop\\scrapiq\\doc scanner\\financial classification.v1i.multiclass\\train\\33_jpg.rf.dd361eb32c7c387b408de5f432c1ab88.jpg', 0), ('C:\\Users\\praag\\Desktop\\scrapiq\\doc scanner\\financial classification.v1i.multiclass\\train\\Screenshot-from-2024-01-11-15-59-43_png.rf.db19d7fc74d12d398be3ae9ee0b9f9e5.jpg', 3), ('C:\\Users\\praag\\Desktop\\scrapiq\\doc scanner\\financial classification.v1i.multiclass\\train\\75_jpg.rf.dfedf5eef0e1407b5814881e54de94b4.jpg', 0)]
Loaded 53 samples from C:\Users\praag\Desktop\scrapiq\doc sca

In [32]:
# after defining train_loader, val_loader…
import torch
from torchvision import models
# count classes automatically
num_classes = len(train_ds.class_names)
device = torch.device("cpu")
#torch.set_num_threads(4)

# model setup…
model = models.resnet50(pretrained=True)
model.fc = torch.nn.Linear(model.fc.in_features, num_classes)
model = model.to(device)

from torch.utils.data import Subset, DataLoader

# Take a small subset of 16 images from training set
tiny_subset = Subset(train_ds, list(range(16)))
tiny_loader = DataLoader(tiny_subset, batch_size=4, shuffle=True)

# Freeze all layers except the final fc
for param in model.parameters():
    param.requires_grad = False
for param in model.fc.parameters():
    param.requires_grad = True

# Re-define optimizer (only for fc)
optimizer = torch.optim.Adam(model.fc.parameters(), lr=1e-3)
# Grab one batch
imgs, labels = next(iter(tiny_loader))
print("Batch imgs shape:", imgs.shape)                 # should be [4,3,224,224]
print("Pixel range:", imgs.min().item(), imgs.max().item())
print("Labels:", labels)

# Forward pass to get raw logits
with torch.no_grad():
    logits = model(imgs.to(device))
print("Logits range:", logits.min().item(), logits.max().item())

# print("\n🔁 Overfitting on 16 samples...\n")
# for epoch in range(20):
#     model.train()
#     total_loss = 0
#     correct = 0
#     for imgs, labels in tiny_loader:
#         imgs, labels = imgs.to(device), labels.to(device)
#         optimizer.zero_grad()
#         outputs = model(imgs)
#         loss = loss_fn(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#         preds = outputs.argmax(dim=1)
#         correct += (preds == labels).sum().item()
#     acc = 100 * correct / len(tiny_subset)
#     print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}, Accuracy = {acc:.2f}%")

# training + validation loop…


Batch imgs shape: torch.Size([4, 3, 224, 224])
Pixel range: -181600174080.0 -278578048.0
Labels: tensor([2, 0, 1, 0])
Logits range: -5645.06201171875 3628.412109375


In [10]:
# 1. Define the loss function and optimizer
loss_fn   = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# 2. Training + Validation Loop
num_epochs = 3
for epoch in range(num_epochs):
    # ——— Training Phase ———
    model.train()
    running_loss = 0.0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)

        # Forward pass
        outputs = model(imgs)
        loss    = loss_fn(outputs, labels)

        # Backward pass + optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * imgs.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{num_epochs} — Training Loss: {epoch_loss:.4f}")

    # ——— Validation Phase ———
    model.eval()
    correct = 0
    total   = 0
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            preds   = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total   += labels.size(0)

    val_acc = 100 * correct / total
    print(f"Epoch {epoch+1}/{num_epochs} — Validation Accuracy: {val_acc:.2f}%\n")

# 3. Save the trained model weights
torch.save(model.state_dict(), "model_classify.pt")
print("✅ Training complete. Model weights saved to model_classify.pt")


Epoch 1/3 — Training Loss: 204.8275
Epoch 1/3 — Validation Accuracy: 16.98%

Epoch 2/3 — Training Loss: 213.9091
Epoch 2/3 — Validation Accuracy: 15.09%

Epoch 3/3 — Training Loss: 158.2696
Epoch 3/3 — Validation Accuracy: 16.98%

✅ Training complete. Model weights saved to model_classify.pt


In [1]:
import os
from collections import Counter
import numpy as np
from PIL import Image

import torch
from torch.utils.data import DataLoader, Subset
from torchvision import transforms, models

# 1) Define the corrected transform pipeline
tform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),                           # 0–255 → 0.0–1.0
    transforms.Lambda(lambda x: x / 255.0),          # force scaling if needed
    transforms.Normalize(                           # now normalize to ImageNet stats
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    ),
])

# 2) (Re)instantiate your datasets with the new pipeline
class DocumentDataset(torch.utils.data.Dataset):
    def __init__(self, img_dir, labels_csv, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        df = __import__('pandas').read_csv(os.path.join(img_dir, labels_csv))
        df.columns = df.columns.str.strip()           # strip any spaces
        self.class_names = ['cheque', 'payslip', 'receipt', 'tax']
        self.class_to_idx = {c:i for i,c in enumerate(self.class_names)}

        self.samples = []
        for _, row in df.iterrows():
            fname = row['filename']
            full_path = os.path.join(img_dir, fname)
            if not os.path.isfile(full_path):
                continue
            # find the one-hot label
            for cls in self.class_names:
                if row[cls] == 1:
                    self.samples.append((full_path, self.class_to_idx[cls]))
                    break

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        img = Image.open(path).convert('RGB')
        if self.transform:
            img = self.transform(img)
        return img, label

# Paths to your splits
base = r"C:\Users\praag\Desktop\scrapiq\doc scanner\financial classification.v1i.multiclass"
train_ds = DocumentDataset(os.path.join(base, "train"), '_classes.csv', transform=tform)
val_ds   = DocumentDataset(os.path.join(base, "valid"), '_classes.csv', transform=tform)
test_ds  = DocumentDataset(os.path.join(base, "test"),  '_classes.csv', transform=tform)

# 3) Quick sanity‑checks
print("✅ Train class counts:", Counter(label for _, label in train_ds.samples))

raw_path, _ = train_ds.samples[0]
raw_img = Image.open(raw_path).convert('RGB')
arr = np.array(raw_img)
print("RAW image pixel range:", arr.min(), arr.max())

tensor_img = train_ds.transform(raw_img)
print("TRANSFORMED tensor shape:", tensor_img.shape)
print("TRANSFORMED pixel range:", tensor_img.min().item(), tensor_img.max().item())

# 4) Create your DataLoaders
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=32)
test_loader  = DataLoader(test_ds,  batch_size=32)

# 5) (Optional) Model setup for the overfit test
device = torch.device("cpu")
model = models.resnet50(pretrained=True)
model.fc = torch.nn.Linear(model.fc.in_features, len(train_ds.class_names))
model = model.to(device)

print("\nTransforms and datasets are now correctly configured. Ready to train!")


✅ Train class counts: Counter({3: 80, 0: 77, 1: 73, 2: 72})
RAW image pixel range: 0 255
TRANSFORMED tensor shape: torch.Size([3, 224, 224])
TRANSFORMED pixel range: -2.1135387420654297 -1.7870151996612549





Transforms and datasets are now correctly configured. Ready to train!


In [5]:
from torch.utils.data import Subset, DataLoader

# 1. Create a tiny subset of 16 samples from your training data
# UNFREEZE the whole model
for param in model.parameters():
    param.requires_grad = True

model.train()  # full model in train mode

# Recreate optimizer (now includes all model params)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # lower LR for stability
loss_fn   = torch.nn.CrossEntropyLoss()

# Overfit on 16 samples again
tiny_subset = Subset(train_ds, list(range(16)))
tiny_loader = DataLoader(tiny_subset, batch_size=4, shuffle=True)

print("\n🔁 Retesting overfit on 16 training samples...\n")
for epoch in range(20):
    total_loss = 0
    correct = 0
    for imgs, labels in tiny_loader:
        imgs, labels = imgs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()

    acc = 100 * correct / len(tiny_subset)
    print(f"Epoch {epoch+1:2d}: Loss = {total_loss:.4f}, Accuracy = {acc:.2f}%")



🔁 Retesting overfit on 16 training samples...

Epoch  1: Loss = 13.5608, Accuracy = 43.75%
Epoch  2: Loss = 5.6268, Accuracy = 68.75%
Epoch  3: Loss = 2.8739, Accuracy = 75.00%
Epoch  4: Loss = 3.8166, Accuracy = 68.75%
Epoch  5: Loss = 1.1304, Accuracy = 87.50%
Epoch  6: Loss = 1.4768, Accuracy = 81.25%
Epoch  7: Loss = 1.6506, Accuracy = 75.00%
Epoch  8: Loss = 1.0810, Accuracy = 93.75%
Epoch  9: Loss = 0.7262, Accuracy = 93.75%
Epoch 10: Loss = 0.4355, Accuracy = 100.00%
Epoch 11: Loss = 1.7155, Accuracy = 75.00%
Epoch 12: Loss = 0.5343, Accuracy = 100.00%
Epoch 13: Loss = 1.1132, Accuracy = 93.75%
Epoch 14: Loss = 1.8247, Accuracy = 81.25%
Epoch 15: Loss = 0.6081, Accuracy = 100.00%
Epoch 16: Loss = 0.6020, Accuracy = 93.75%
Epoch 17: Loss = 0.4681, Accuracy = 93.75%
Epoch 18: Loss = 1.6566, Accuracy = 87.50%
Epoch 19: Loss = 0.8456, Accuracy = 93.75%
Epoch 20: Loss = 1.8844, Accuracy = 75.00%


In [7]:


for param in model.parameters():
    param.requires_grad = True
model.train()



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [9]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
loss_fn = torch.nn.CrossEntropyLoss()


In [11]:
from torchvision import transforms

train_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x / 255.0),
    transforms.Normalize([0.485,0.456,0.406], [0.229,0.224,0.225]),
])

val_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x / 255.0),
    transforms.Normalize([0.485,0.456,0.406], [0.229,0.224,0.225]),
])


In [13]:
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    running_loss, correct, total = 0, 0, 0

    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * imgs.size(0)
        correct += (outputs.argmax(1) == labels).sum().item()
        total += labels.size(0)

    train_acc = 100 * correct / total
    train_loss = running_loss / total

    # Validation
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)
    val_acc = 100 * correct / total

    print(f"Epoch {epoch+1}/{num_epochs} — Train Loss: {train_loss:.4f} — Train Acc: {train_acc:.2f}% — Val Acc: {val_acc:.2f}%")
    scheduler.step()
torch.save(model.state_dict(), "best_model.pt")


Epoch 1/15 — Train Loss: 0.8681 — Train Acc: 79.47% — Val Acc: 26.42%
Epoch 2/15 — Train Loss: 0.3125 — Train Acc: 88.74% — Val Acc: 16.98%
Epoch 3/15 — Train Loss: 0.1790 — Train Acc: 93.71% — Val Acc: 32.08%
Epoch 4/15 — Train Loss: 0.1127 — Train Acc: 96.36% — Val Acc: 16.98%
Epoch 5/15 — Train Loss: 0.0816 — Train Acc: 96.36% — Val Acc: 24.53%
Epoch 6/15 — Train Loss: 0.0566 — Train Acc: 97.68% — Val Acc: 83.02%
Epoch 7/15 — Train Loss: 0.0587 — Train Acc: 98.34% — Val Acc: 58.49%
Epoch 8/15 — Train Loss: 0.0466 — Train Acc: 98.34% — Val Acc: 52.83%
Epoch 9/15 — Train Loss: 0.0203 — Train Acc: 99.01% — Val Acc: 49.06%
Epoch 10/15 — Train Loss: 0.0114 — Train Acc: 99.67% — Val Acc: 86.79%
Epoch 11/15 — Train Loss: 0.0145 — Train Acc: 99.67% — Val Acc: 64.15%
Epoch 12/15 — Train Loss: 0.0087 — Train Acc: 99.67% — Val Acc: 67.92%
Epoch 13/15 — Train Loss: 0.0039 — Train Acc: 100.00% — Val Acc: 90.57%
Epoch 14/15 — Train Loss: 0.0010 — Train Acc: 100.00% — Val Acc: 94.34%
Epoch 15/15 —

In [17]:
from datasets import load_dataset

dataset = load_dataset("ds4sd/DocLayNet")

dataset
DatasetDict({
    train: Dataset({
        features: ['image_id', 'image', 'width', 'height', 'doc_category', 'collection', 'doc_name', 'page_no', 'objects'],
        num_rows: 69375
    })
    validation: Dataset({
        features: ['image_id', 'image', 'width', 'height', 'doc_category', 'collection', 'doc_name', 'page_no', 'objects'],
        num_rows: 6489
    })
    test: Dataset({
        features: ['image_id', 'image', 'width', 'height', 'doc_category', 'collection', 'doc_name', 'page_no', 'objects'],
        num_rows: 4999
    })
})

SyntaxError: invalid syntax. Perhaps you forgot a comma? (3948470738.py, line 7)