In [1]:
!pip install transformers pdfplumber pytorch_lightning pytesseract scikit-learn --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m352.7 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m825.4/825.4 kB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import pytesseract
import pdfplumber
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW

In [3]:
def extract_text_from_pdf(path):
    with pdfplumber.open(path) as pdf:
        return "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])

def extract_text_from_image(path):
    return pytesseract.image_to_string(Image.open(path))

invoice_text = extract_text_from_pdf("/content/Purchase Invoice.pdf")
bank_text = extract_text_from_pdf("/content/Bank Statement.pdf")
receipt_text = extract_text_from_image("/content/Purchase Receipt.jpg")

In [4]:
texts = [invoice_text, bank_text, receipt_text]
labels = ["Invoice", "Bank Statement", "Money Receipt"]
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)

for i in range(len(texts)):
    print(f"{labels[i]} => {encoded_labels[i]}")


Invoice => 1
Bank Statement => 0
Money Receipt => 2


In [6]:
from google.colab import userdata
try:
    my_secret = userdata.get('HF_TOKEN')
    print("Secret retrieved successfully")
except userdata.SecretNotFoundError:
    print("Secret not found")
hf_token = my_secret

Secret retrieved successfully


In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", token=hf_token)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
class DocumentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length',
                                  max_length=self.max_len, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }
dataset = DocumentDataset(texts, encoded_labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=2)


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    token=hf_token
)
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
model.train()
for epoch in range(10):
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch+1} loss: {loss.item():.4f}")


Epoch 1 loss: 1.1998
Epoch 2 loss: 0.9465
Epoch 3 loss: 0.9118
Epoch 4 loss: 0.8894
Epoch 5 loss: 0.6150
Epoch 6 loss: 0.6542
Epoch 7 loss: 0.6732
Epoch 8 loss: 0.4861
Epoch 9 loss: 0.4253
Epoch 10 loss: 0.3347


In [12]:
model.eval()
predictions = []
class_names = le.inverse_transform([0, 1, 2])
with torch.no_grad():
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().numpy())
print("\nClassification Results:")
print(classification_report(encoded_labels, predictions, target_names=class_names))



Classification Results:
                precision    recall  f1-score   support

Bank Statement       1.00      1.00      1.00         1
       Invoice       1.00      1.00      1.00         1
 Money Receipt       1.00      1.00      1.00         1

      accuracy                           1.00         3
     macro avg       1.00      1.00      1.00         3
  weighted avg       1.00      1.00      1.00         3



In [13]:
def classify_document(path, model, tokenizer, label_encoder, max_len=512):
    if path.lower().endswith(".pdf"):
        text = extract_text_from_pdf(path)
    elif path.lower().endswith((".jpg", ".jpeg", ".png")):
        text = extract_text_from_image(path)
    else:
        raise ValueError("Unsupported file type. Use PDF or JPG/PNG.")
    if not text.strip():
        return "No readable text found."
    encoding = tokenizer(text, truncation=True, padding='max_length',
                         max_length=max_len, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        pred = torch.argmax(outputs.logits, dim=1).item()
        label = label_encoder.inverse_transform([pred])[0]
    return label


In [14]:
file_path = "/content/Purchase Invoice.pdf"  # Replace with your test file path
result = classify_document(file_path, model, tokenizer, le)
print(f"Predicted Document Type: {result}")

Predicted Document Type: Invoice


In [15]:
file_path = "/content/Bank Statement.pdf"  # Replace with your test file path
result = classify_document(file_path, model, tokenizer, le)
print(f"Predicted Document Type: {result}")

Predicted Document Type: Bank Statement


In [16]:
file_path = "/content/Purchase Receipt.jpg"  # Replace with your test file path
result = classify_document(file_path, model, tokenizer, le)
print(f"Predicted Document Type: {result}")

Predicted Document Type: Money Receipt
