In [316]:
import os
import pandas as pd
import numpy as np
import logging
from PyPDF2 import PdfReader

In [317]:
def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page in the PDF at ``pdf_path``.

    Pages for which extraction yields nothing (``None`` or an empty string)
    are skipped, so the returned list can be shorter than the page count and
    may be empty.

    Parameters:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        list[str]: One entry per page that produced text, in page order.
    """
    pages = PdfReader(pdf_path).pages
    # Lazily extract page by page, then keep only the truthy results.
    extracted = (page.extract_text() for page in pages)
    return [page_text for page_text in extracted if page_text]

In [318]:
def is_scanned_pdf(pdf_path):
    """Guess whether the PDF at ``pdf_path`` is a scan with no text layer.

    Heuristic: the PDF is treated as scanned when direct text extraction
    produces no non-whitespace text on any page.

    ``extract_text_from_pdf`` leaves out pages that yield no text, so an
    image-only PDF produces an empty list. Indexing ``[0]`` on that list
    raised ``IndexError`` in exactly the case this function is meant to
    detect; ``any`` over the pages handles the empty list naturally.

    Parameters:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        bool: True if no page produced non-whitespace text, else False.
    """
    page_texts = extract_text_from_pdf(pdf_path)
    return not any(page_text.strip() for page_text in page_texts)

In [319]:
import pytesseract
import cv2
from pdf2image import convert_from_path

# Setting pytesseract path.
# The TESSERACT_CMD environment variable takes precedence; otherwise the
# standard Windows install location is used. The path is only applied when the
# file actually exists, so on machines without that install pytesseract keeps
# its own default instead of being pointed at a missing file.
_DEFAULT_TESSERACT_CMD = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
_tesseract_cmd = os.environ.get('TESSERACT_CMD', _DEFAULT_TESSERACT_CMD)
if os.path.isfile(_tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

def ocr_from_pdf(pdf_path):
    """OCR every page of the PDF at ``pdf_path`` and return the combined text.

    Each page is rendered to an image, converted to grayscale and passed to
    Tesseract. The recognised text of each page is followed by a newline and
    the pages are concatenated in order.

    Parameters:
        pdf_path: Path of the PDF file to rasterise and OCR.

    Returns:
        str: The OCR text of all pages; an empty string if there are no pages.
    """
    page_texts = []
    for page_image in convert_from_path(pdf_path):
        # The rendered page is treated as RGB: go RGB -> BGR -> grayscale.
        bgr_pixels = cv2.cvtColor(np.array(page_image), cv2.COLOR_RGB2BGR)
        grayscale = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2GRAY)
        # Further preprocessing (thresholding, denoising, ...) would go here.
        page_texts.append(pytesseract.image_to_string(grayscale) + "\n")
    return "".join(page_texts)

In [320]:
import re
import spacy
from datetime import datetime

# Load the spaCy "en_core_web_sm" pipeline once at module level.
# NOTE(review): `nlp` is not referenced by any function visible in this
# section — confirm it is used elsewhere before keeping the load cost.
nlp = spacy.load("en_core_web_sm")

def extract_invoice_number(text):
    """Find the invoice number that follows an invoice-number label in ``text``.

    Recognised labels (case-insensitive): "Invoice Number", "Invoice No" /
    "Invoice No.", "Invoice #" / "Invoice #:" and a bare "#". The label may be
    followed by optional whitespace and one ":" or "-" before the value. The
    value is a run of letters, digits, "-" and "/".

    Parameters:
        text (str): Raw text of a page.

    Returns:
        str | None: The captured invoice number, or None if no label matched.

    Side effects:
        Writes the raw ``text`` to ``output.txt`` in the working directory,
        overwriting it on every call.
    """
    # Debugging dump of the raw text; the file is overwritten on each call, so
    # it only ever holds the most recently processed text.
    with open('output.txt', 'w', encoding='utf-8') as file:
        file.write(text)
    # Collapse line breaks and tabs so a label and its value can sit on
    # different lines.
    text = re.sub(r'[\n\t]+', ' ', text)
    # "Invoice Number" must be tried before "Invoice No": with the shorter
    # label first, "Invoice Number: 123" matched "Invoice No" and captured
    # "umber" as the value. The (?![A-Za-z]) lookahead additionally stops
    # "Invoice No" from matching the start of a longer word.
    match = re.search(
        r'(?:Invoice Number|Invoice No(?![A-Za-z])\.?|Invoice #:?|#)\s*[:\-]?\s*([A-Z0-9\-\/]+)',
        text,
        re.IGNORECASE,
    )
    return match.group(1) if match else None

def convert_to_dd_mm_yyyy(date_str):
    """
    Convert various date formats to DD-MM-YYYY.

    Separators ("-", "/" and whitespace, in any mix) are normalised to "-"
    before parsing, so "12/05/2024", "12 05 2024" and "12-05-2024" are all
    handled by the same format. Supported layouts after normalisation:
    day-month-year with a numeric, abbreviated or full month name and a 2- or
    4-digit year, plus year-month-day with a 4-digit year.

    Parameters:
        date_str (str): The date string to convert.

    Returns:
        str: The date in DD-MM-YYYY format, or None if conversion fails
        (including when ``date_str`` is not a string).
    """
    if not isinstance(date_str, str):
        return None

    # One canonical separator keeps the format list small and covers every
    # separator combination instead of only the ones spelled out by hand.
    normalised = re.sub(r'[-/\s]+', '-', date_str.strip())

    # 4-digit-year formats come first so a 4-digit year is never read as a
    # 2-digit one.
    for fmt in ("%d-%m-%Y", "%d-%b-%Y", "%d-%B-%Y", "%Y-%m-%d",
                "%d-%m-%y", "%d-%b-%y", "%d-%B-%y"):
        try:
            date_obj = datetime.strptime(normalised, fmt)
        except ValueError:
            continue
        return date_obj.strftime("%d-%m-%Y")
    return None

def extract_invoice_date(text):
    """Locate the first labelled date in ``text`` and return it as DD-MM-YYYY.

    A date is recognised after the label "Invoice Date" or "Date"
    (case-insensitive), optionally followed by ":" or "-". Accepted shapes are
    numeric day/month/year, day + month name (abbreviated or full) + year, and
    year/month/day, with "-", "/" or a space between the parts.

    Parameters:
        text (str): Raw text of a page.

    Returns:
        str | None: The result of ``convert_to_dd_mm_yyyy`` on the matched
        date, or None when no labelled date is found.
    """
    date_pattern = r'(?:Invoice Date|Date)\s*[:\-]?\s*(\d{1,2}[-/ ]\d{1,2}[-/ ]\d{2,4}|\d{1,2}[-/ ](?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[-/ ]\d{2,4}|\d{1,2}[-/ ](?:January|February|March|April|May|June|July|August|September|October|November|December)[-/ ]\d{2,4}|\d{4}[-/ ]\d{1,2}[-/ ]\d{1,2})'
    found = re.search(date_pattern, text, re.IGNORECASE)
    if found is None:
        return None
    return convert_to_dd_mm_yyyy(found.group(1))

def extract_total_amount(text):
    """Extract the amount that follows a "Total" label in ``text``.

    Recognised labels (case-insensitive): "TOTAL AMOUNT", "Grand Total" and
    "Total". The label may be followed by an optional ":" and an optional
    rupee sign. Labels that are part of "Subtotal", "Sub Total" or "Sub-Total"
    are skipped so the final total is preferred over the subtotal.

    The amount is either comma-grouped digits (Indian or Western grouping,
    e.g. "1,23,456.00" or "12,345.50") or a plain run of digits
    (e.g. "1500.00"), with an optional one- or two-digit decimal part.

    Parameters:
        text (str): Raw text of a page.

    Returns:
        str | None: The amount with grouping commas removed, or None if no
        labelled amount is found.
    """
    # The first alternative handles comma-grouped amounts, the second plain
    # digit runs. Limiting the leading group to two digits, as before,
    # truncated un-grouped amounts such as "1500.00" to "15".
    match = re.search(
        r'(?<!sub)(?<!sub[ \-])(?:TOTAL AMOUNT|Grand Total|Total)\s*:?\s*₹?\s*'
        r'(\d{1,3}(?:,\d{2,3})+(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)',
        text,
        re.IGNORECASE,
    )
    return match.group(1).replace(',', '') if match else None

In [321]:
def extract_all_fields(text):
    """Run every field extractor over ``text`` and collect the results.

    Parameters:
        text (str): Raw text of a page.

    Returns:
        dict: Keys ``invoice_number``, ``invoice_date`` and ``total_amount``
        (in that order), each holding the corresponding extractor's result.
    """
    # Line-item extraction is not wired in; only the header fields are
    # collected.
    extractors = (
        ('invoice_number', extract_invoice_number),
        ('invoice_date', extract_invoice_date),
        ('total_amount', extract_total_amount),
    )
    return {field: extract(text) for field, extract in extractors}

In [322]:
def process_invoice(pdf_path):
    """Extract invoice fields from one PDF and return them as a DataFrame.

    The PDF is OCR'd when ``is_scanned_pdf`` reports it as scanned; otherwise
    its text is extracted directly, one text per page. Every text is run
    through ``extract_all_fields`` and becomes one row.

    Parameters:
        pdf_path: Path of the PDF file to process.

    Returns:
        pandas.DataFrame: On success, one row per processed text with the
        extracted fields plus a ``file`` column holding the PDF's base name.
        When a file contributes several rows, rows without an invoice number
        are dropped; if none of them has one, the frame is empty.
        On failure, a single row with the columns ``file``, ``data``,
        ``validations``, ``trust`` and ``error`` (the exception message), so
        the result can always be concatenated with other results.
    """
    try:
        logging.info(f"Processing file: {pdf_path}")
        if is_scanned_pdf(pdf_path):
            logging.info("Detected as scanned PDF. Using OCR.")
            # str.replace returns a new string, so the result has to be kept;
            # calling it without assignment left the OCR text unchanged.
            textlist = [ocr_from_pdf(pdf_path).replace('\n', ' ')]
        else:
            logging.info("Detected as text PDF. Extracting text directly.")
            textlist = extract_text_from_pdf(pdf_path)

        if not textlist:
            raise ValueError("No text could be extracted from the PDF")

        # One record per text; a list of dicts builds the same frame for one
        # page and for many, without merging values into per-key lists by hand.
        records = [extract_all_fields(page_text) for page_text in textlist]
        df = pd.DataFrame(records)

        df['file'] = os.path.basename(pdf_path)
        # All rows share the same file name, so this flag is True exactly when
        # the file produced more than one row. In that case rows lacking an
        # invoice number are discarded.
        duplicated_files = df['file'].duplicated(keep=False)
        df = df[~(duplicated_files & df['invoice_number'].isna())]

        return df
    except Exception as e:
        logging.error(f"Failed to process {pdf_path}: {e}")
        # Same failure payload as before, wrapped in a one-row DataFrame so the
        # return type matches the success path.
        return pd.DataFrame([{
            'file': os.path.basename(pdf_path),
            'data': None,
            'validations': None,
            'trust': None,
            'error': str(e)
        }])

In [323]:
invoice_dir = 'data/sample_invoices/'
output_csv = 'extracted_data/invoices_extracted.csv'

# Collect one frame per invoice and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows each time.
frames = []
# Sorted so the row order of the CSV does not depend on directory listing order.
for filename in sorted(os.listdir(invoice_dir)):
    if not filename.lower().endswith('.pdf'):
        continue
    pdf_path = os.path.join(invoice_dir, filename)
    result = process_invoice(pdf_path)
    if not isinstance(result, pd.DataFrame):
        # Defensive: a result that is not a DataFrame (e.g. a dict describing
        # a failure) is wrapped into a one-row frame so it can be concatenated.
        result = pd.DataFrame([result])
    frames.append(result)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory holds no PDFs.
results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Create the output directory if needed instead of failing inside to_csv.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
results.to_csv(output_csv, index=False)
logging.info(f"Extraction completed. Results saved to {output_csv}")