In [2]:
import os
import cv2
import numpy as np
import pandas as pd
from PIL import Image
import pytesseract
import torch
from transformers import LayoutLMTokenizer, LayoutLMForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=3)

Some weights of LayoutLMForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def load_images_from_directory(base_dir):
    images = []
    labels = []
    for doc_type in os.listdir(base_dir):
        doc_type_path = os.path.join(base_dir, doc_type)
        if os.path.isdir(doc_type_path):
            for filename in os.listdir(doc_type_path):
                if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
                    img_path = os.path.join(doc_type_path, filename)
                    try:
                        img = cv2.imread(img_path)
                        if img is None:
                            raise ValueError(f"Unable to read image: {img_path}")
                        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                        images.append((filename, img))
                        labels.append(doc_type)
                        logging.info(f"Successfully loaded: {img_path}")
                    except Exception as e:
                        logging.error(f"Error loading {img_path}: {str(e)}")
    return images, labels

In [5]:
def perform_ocr(image):
    try:
        if not isinstance(image, np.ndarray):
            raise ValueError("Input to perform_ocr must be a numpy array")
        pil_image = Image.fromarray(image)
        text = pytesseract.image_to_string(pil_image)
        return text
    except Exception as e:
        logging.error(f"OCR error: {str(e)}")
        return ""

In [6]:
def classify_document(text):
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return predictions.argmax().item()

In [7]:
def extract_info(text):
    # This is a placeholder function. In a real-world scenario, you'd use a more sophisticated
    # named entity recognition model or rule-based system to extract specific information.
    info = {
        "text_length": len(text),
        "numeric_values": [num for num in text.split() if num.replace('.', '').isdigit()],
        "possible_dates": [word for word in text.split() if '/' in word or '-' in word]
    }
    return info

In [8]:

def main():
    base_dir = './archive'
    images, labels = load_images_from_directory(base_dir)
    
    if not images:
        logging.error("No valid images found. Please check your directory and image files.")
        return

    data = []
    le = LabelEncoder()
    encoded_labels = le.fit_transform(labels)

    for (filename, image), label, encoded_label in zip(images, labels, encoded_labels):
        logging.info(f"Processing image: {filename}")
        
        text = perform_ocr(image)
        logging.info(f"OCR result for {filename}: {text[:100]}...")  # Log the first 100 characters of each OCR result
        
        if text.strip():
            predicted_class = classify_document(text)
            info = extract_info(text)
            info['filename'] = filename
            info['true_label'] = label
            info['predicted_label'] = le.inverse_transform([predicted_class])[0]
            info['correct_prediction'] = (predicted_class == encoded_label)
            data.append(info)
        else:
            logging.warning(f"Empty text extracted from {filename}. Skipping this image.")

    if not data:
        logging.error("No valid texts extracted from images. Cannot proceed with classification.")
        return

    df = pd.DataFrame(data)
    accuracy = (df['correct_prediction'].sum() / len(df)) * 100
    logging.info(f"Model accuracy: {accuracy:.2f}%")

    logging.info("DataFrame head:")
    logging.info(df.head())
    
    df.to_csv('extracted_document_info.csv', index=False)
    logging.info("Data saved to 'extracted_document_info.csv'")

In [9]:
if __name__ == "__main__":
    main()

ERROR:root:Error loading ./archive\Bank Statement\50.jpg: Unable to read image: ./archive\Bank Statement\50.jpg
ERROR:root:Error loading ./archive\Check\70.jpg: Unable to read image: ./archive\Check\70.jpg
ERROR:root:Error loading ./archive\ITR_Form 16\34.jpg: Unable to read image: ./archive\ITR_Form 16\34.jpg
ERROR:root:Error loading ./archive\ITR_Form 16\37.jpg: Unable to read image: ./archive\ITR_Form 16\37.jpg
ERROR:root:Error loading ./archive\ITR_Form 16\39.jpg: Unable to read image: ./archive\ITR_Form 16\39.jpg
ERROR:root:Error loading ./archive\ITR_Form 16\42.jpg: Unable to read image: ./archive\ITR_Form 16\42.jpg
ERROR:root:Error loading ./archive\Salary Slip\23.jpg: Unable to read image: ./archive\Salary Slip\23.jpg
ERROR:root:Error loading ./archive\Salary Slip\41.jpg: Unable to read image: ./archive\Salary Slip\41.jpg
ERROR:root:Error loading ./archive\Utility\11.jpg: Unable to read image: ./archive\Utility\11.jpg
ERROR:root:Error loading ./archive\Utility\31.jpg: Unable to