In [4]:
import os
import pandas as pd
from src.pdf_processor import extract_text_from_pdf, is_scanned_pdf
from src.ocr_processor import ocr_from_pdf
from src.data_extractor import extract_all_fields
from src.accuracy_checker import validate_fields
from src.trust_determiner import determine_trust
import logging

In [38]:
import re

def extract_total_amount(text):
    # Regex pattern to extract the total amount
    pattern = r'(?:Total|TOTAL|Grand Total|TOTAL AMOUNT)?\s*₹?\s*([\d,]+\.\d{2}|\d{1,3}(?:,\d{3})*)'
    
    # Search for the pattern in the input text
    match = re.search(pattern, text)
    
    if match:
        # Extract the matched amount (remove commas for conversion to float)
        amount_str = match.group(1).replace(',', '')
        # Convert to float and return the amount
        return float(amount_str)
    else:
        # Return None if no amount was found
        return None

amount = extract_total_amount('Total₹5,791.00 asfaerert Total₹10,791.00')

print(amount)



5791.0


In [15]:
def process_invoice(pdf_path):
    try:
        logging.info(f"Processing file: {pdf_path}")
        if is_scanned_pdf(pdf_path):
            logging.info("Detected as scanned PDF. Using OCR.")
            text = ocr_from_pdf(pdf_path)
        else:
            logging.info("Detected as text PDF. Extracting text directly.")
            text = extract_text_from_pdf(pdf_path)
        
        data = extract_all_fields(text)
        print(data)
        validations = validate_fields(data)
        trust = determine_trust(validations)
        
        return {
            'file': os.path.basename(pdf_path),
            'data': data,
            'validations': validations,
            'trust': trust
        }
    except Exception as e:
        logging.error(f"Failed to process {pdf_path}: {e}")
        return {
            'file': os.path.basename(pdf_path),
            'data': None,
            'validations': None,
            'trust': None,
            'error': str(e)
        }

invoice_dir = 'data/sample_invoices/'
results = []

for filename in os.listdir(invoice_dir):
    if filename.lower().endswith('.pdf'):
        pdf_path = os.path.join(invoice_dir, filename)
        result = process_invoice(pdf_path)
        results.append(result)

# Convert results to DataFrame
df = pd.json_normalize(results)

{'invoice_number': None, 'invoice_date': None, 'total_amount': None, 'line_items': []}
{'invoice_number': None, 'invoice_date': None, 'total_amount': None, 'line_items': []}


In [35]:
import re

def extract_final_total_amount(text):
    """
    Extracts the final total amount from the given text.

    Parameters:
        text (str): The input text containing invoice details.

    Returns:
        float or None: The extracted total amount as a float, or None if not found.
    """
    # Step 1: Define a regex pattern to find all 'Total' occurrences
    total_pattern = re.compile(r'Total(?:\s+\d+)?', re.IGNORECASE)

    # Step 2: Find all matches of 'Total'
    totals = list(total_pattern.finditer(text))

    if totals:
        # Step 3: Focus on the last 'Total' occurrence
        last_total = totals[-1]

        # Step 4: Extract the substring after the last 'Total'
        after_total = text[last_total.end():]

        # Step 5: Define a regex pattern to find all amounts after 'Total'
        amount_pattern = re.compile(r'[₹$]?\s*([\d,]+\.\d{2})')

        # Step 6: Find all amounts in the substring
        amounts = amount_pattern.findall(after_total)

        if amounts:
            # Step 7: Convert extracted amounts to floats after removing commas
            amounts = [float(amount.replace(',', '')) for amount in amounts]
            print(amounts)
            # Option 1: Return the **last** amount as the final total
            # final_total = amounts[-1]

            # Option 2: Alternatively, return the **largest** amount
            final_total = max(amounts)

            return final_total

    # If 'Total' not found or no amounts found after 'Total'
    return None

# Example Usage
sample_text = """
Campos Technologies
F25-26, 1st Floor, Alfran Plaza, Opp. Naturals Ice Cream,
Nr. Don Bosco School, M.G Road, Panaji Goa 403001
Phone no. : 9145511772
Email : campostechgoa@gmail.com
GSTIN : 30BHLPC5928C2ZS
State: 30-Goa
Invoice
Bill To
HIMTIK DESIGN
H NO 3/6 CHAMUNDA RESIDENCY CARANZALEM
CARANZALEM
GSTIN : 30AAJFH3702R1ZB
State: 30-Goa
Invoice Details
Invoice No. : CT/2024-25/15
Date : 03-07-2024
Place of supply: 30-Goa
# Item name HSN/
SAC MRP Quantity Unit Price/
Unit GST Amount
1
Benq GP100 LED
Projector
(Warranty - 2 Years)
8528 ₹
1,25,000.00 1 Pcs ₹
67,203.39
₹
12,096.61
(18%)
₹
79,300.00
2
Motorized Projector
Screen
(1 Year Warranty. Aspect Ratio
16:9, 100" inch)
9011 ₹
21,999.00 1 Pcs ₹
10,000.00
₹ 1,800.00
(18%)
₹
11,800.00
Total 2 ₹
13,896.61
₹
91,100.00
"""

# Extract the final total amount
total_amount = extract_final_total_amount(sample_text)

if total_amount is not None:
    print(f"Extracted Total Amount: {total_amount}")
else:
    print("No valid total amount found.")


[13896.61, 91100.0]
Extracted Total Amount: 91100.0


In [39]:
import re
from datetime import datetime

def convert_to_dd_mm_yyyy(date_str):
    """
    Convert various date formats to DD-MM-YYYY.

    Parameters:
        date_str (str): The date string to convert.

    Returns:
        str: The date in DD-MM-YYYY format, or None if conversion fails.
    """
    # Try different date formats for conversion
    for fmt in ("%d-%m-%Y", "%d/%m/%Y", "%d %b %Y", "%d %B %Y", "%Y-%m-%d", "%d-%b-%y", "%d/%m/%y"):
        try:
            date_obj = datetime.strptime(date_str, fmt)
            return date_obj.strftime("%d-%m-%Y")
        except ValueError:
            continue
    return None

def extract_single_invoice_date(text):
    """
    Extracts the first invoice date from the given text and returns it in DD-MM-YYYY format.

    Parameters:
        text (str): The input text containing invoice details.

    Returns:
        str or None: The extracted date in DD-MM-YYYY format, or None if not found.
    """
    # Define the regex pattern for various date formats
    pattern = r'(?:Invoice Date|Date):?\s*(\d{1,2}[-/ ]\d{1,2}[-/ ]\d{2,4}|\d{1,2}[-/ ](?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[-/ ]\d{2,4}|\d{1,2}[-/ ](?:January|February|March|April|May|June|July|August|September|October|November|December)[-/ ]\d{2,4}|\d{4}[-/ ]\d{1,2}[-/ ]\d{1,2})'
    
    # Find all matches in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    # Convert the first match to DD-MM-YYYY format
    if matches:
        return convert_to_dd_mm_yyyy(matches[0])
    
    return None

# Test examples
invoice_texts = [
    "Invoice Date: 15 Feb 2024",
    "Date: 02-07-2024",
    "Date 2024-07-10",
    "29-Jul-24",
    "Invoice Date : 21/07/2024",
    "Invoice Date: 15/08/2024",
    "Date: 10/07/2024",
    "Invoice Date: 1-JULY-2024",
    "Date : 08-07-2024",
    "Invoice Date:\n18/05/2024",
    "Invoice Date : 16/07/2024"
]

for text in invoice_texts:
    date = extract_single_invoice_date(text)
    print(f"Extracted Date from '{text}': {date}")


Extracted Date from 'Invoice Date: 15 Feb 2024': 15-02-2024
Extracted Date from 'Date: 02-07-2024': 02-07-2024
Extracted Date from 'Date 2024-07-10': 10-07-2024
Extracted Date from '29-Jul-24': None
Extracted Date from 'Invoice Date : 21/07/2024': None
Extracted Date from 'Invoice Date: 15/08/2024': 15-08-2024
Extracted Date from 'Date: 10/07/2024': 10-07-2024
Extracted Date from 'Invoice Date: 1-JULY-2024': None
Extracted Date from 'Date : 08-07-2024': None
Extracted Date from 'Invoice Date:
18/05/2024': 18-05-2024
Extracted Date from 'Invoice Date : 16/07/2024': None


In [48]:
import re

def extract_invoice_numbers(text):
    """
    Extracts invoice numbers from the given text.

    Parameters:
        text (str): The input text containing invoice details.

    Returns:
        list: A list of extracted invoice numbers.
    """
    # Define the regex pattern
    text = re.sub(r'[\n\t]+', ' ', text)
    pattern = r'(?:Invoice No\.?|Invoice Number|Invoice #:?|#)\s*[:\-]?\s*([A-Z0-9\-\/]+)'
    
    # Find all matches in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    
    return matches

# Test examples
invoice_texts = [
    "Invoice No.: 614",
    "Invoice No.: OD14414",
    "# : INV-802",
    "Invoice No 657",
    "Invoice No.: 216",
    "Invoice No. : CT/2024-25/19",
    "Invoice #: INV-73",
    "Invoice No.: 216",
    "Invoice No 648",
    "Invoice Number: RA29/OD-2",
    "Invoice No. : CT/2024-25/17",
    '''Invoice	No
648'''
]

for text in invoice_texts:
    invoice_numbers = extract_invoice_numbers(text)
    print(f"Extracted Invoice Numbers from '{text}': {invoice_numbers}")


Extracted Invoice Numbers from 'Invoice No.: 614': ['614']
Extracted Invoice Numbers from 'Invoice No.: OD14414': ['OD14414']
Extracted Invoice Numbers from '# : INV-802': ['INV-802']
Extracted Invoice Numbers from 'Invoice No 657': ['657']
Extracted Invoice Numbers from 'Invoice No.: 216': ['216']
Extracted Invoice Numbers from 'Invoice No. : CT/2024-25/19': ['CT/2024-25/19']
Extracted Invoice Numbers from 'Invoice #: INV-73': ['INV-73']
Extracted Invoice Numbers from 'Invoice No.: 216': ['216']
Extracted Invoice Numbers from 'Invoice No 648': ['648']
Extracted Invoice Numbers from 'Invoice Number: RA29/OD-2': ['RA29/OD-2']
Extracted Invoice Numbers from 'Invoice No. : CT/2024-25/17': ['CT/2024-25/17']
Extracted Invoice Numbers from 'Invoice	No
648': ['648']


In [45]:
import pandas as pd
import re

def extract_invoice_table(text):
    """
    Extracts table-like structures from the given text and returns it as a DataFrame.

    Parameters:
        text (str): The input text containing invoice details.

    Returns:
        DataFrame: A pandas DataFrame containing the extracted table data, or None if no table is found.
    """
    # Split the text into lines
    lines = text.strip().split('\n')
    
    # Clean and filter out any empty lines
    lines = [line.strip() for line in lines if line.strip()]

    # Prepare a list to hold item data
    item_data = []

    # Pattern to match item lines (assuming item lines have a number followed by a description, quantity, and amount)
    item_pattern = re.compile(r'(\d+)\s+([\d\s\w\(\)-]+)\s+(\d+)\s+([\d,]+\.\d{2})')

    # Iterate through each line to find item rows
    for line in lines:
        match = item_pattern.search(line)
        if match:
            # Extract item details
            item_number = match.group(1)
            description = match.group(2).strip()
            quantity = match.group(3)
            amount = match.group(4).replace(',', '')  # Remove commas from amount

            # Append extracted data to item_data
            item_data.append({
                'Item No': item_number,
                'Description': description,
                'Quantity': quantity,
                'Amount': float(amount)
            })

    # Create a DataFrame from the extracted item data
    if item_data:
        df = pd.DataFrame(item_data)
        return df
    else:
        return None

# Test example with provided data
invoice_text = """
Invoice. No.& Date
SHP/39/24-2025
29-Jul-24
Contact Person
KARTHICK
9345057059
1 85369090 200Set 9,800.00
2 392390 60 Pkd 10,320.00
3 3923 200 Pkd 17,000.00
4 94054200 2 Nos 4,230.00
41,350.00
7443
48,793.00
0
48,793.00
Authorised Signatory
Shipping Address
EDISON ENERGY INDIA PVT LTD
SF No :4/1 ,4/2 ,4/3 Kongudipatty Village
Pudukotai District ,Tamilnadu
Illuppur TK ,Post 622102
MC4 connector 1000V ,30 A
INVOICE
SRI HARI ENTERPRISES
5/252 Plot 80 1st floor
Sudaruli Nagar
Billing Address
Thalambur, Chennai 600130
GST IN : 33CKBPS7055L1ZS
State Name : Tamil Nadu, Code : 33
E-Mail : Thenkilakku2019@gmail.com
EDISON ENERGY INDIA PVT LTD
Suite No :3 ,2 nd Floor ,Sun Plaza
G.N.Chetty Road ,Chennai 600006
15 MW Solar Power Plant
cable tie - 12inch
cable tie - 6inch
Hawells Flood light 60 W
GST 18%
Total
Grand Total
Round off
Bank Details:
Name :SRI HARI ENTERPRISE
Bank : Canara bank
Account number : 120002029065
IFSC code : CNRB0016495
Payment terms :
1. 100% advance payment along with the purchase order.
2. Delivery Time : 10 To 15 days.
Supply Items :
SI.No QTY
GST IN : 33AADCE9497R2ZN
Old No : 19 ,New No :39
TOTAL AMOUNDESCRIPTION HSN CODE T
Final Net Amount
Amount in worlds: Forty Eight thousand seven hundred and Ninety three only
for SriHari Enterprise
"""

# Extracting the invoice items into a DataFrame
df_items = extract_invoice_table(invoice_text)
print(df_items)


None


In [50]:
import re
import PyPDF2
import pandas as pd

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text() + '\n'  # Add each page's text to the overall text
    return text

# Function to extract all invoice numbers and their corresponding text
def extract_invoices(text):
    # Define the regex pattern for finding invoice numbers
    pattern = r'(?:Invoice No\.?|Invoice Number|Invoice #:?|#)\s*[:\-]?\s*([A-Z0-9\-\/]+)'
    
    # Find all matches in the text
    matches = re.finditer(pattern, text, re.IGNORECASE)
    
    # Create a list to hold invoice numbers and their corresponding text
    invoice_list = []
    previous_index = 30
    for match in matches:
        invoice_number = match.group(1)
        # Extracting text around the invoice number (for simplicity, let's just take a certain number of characters before and after)
        start_index = max(0, previous_index - 30)  # 30 characters before
        end_index = min(len(text), match.end() + 30)  # 30 characters after
        invoice_text = text[start_index:end_index].strip()  # Get the corresponding text
        
        # Append a tuple of invoice number and text to the list
        invoice_list.append((invoice_number, invoice_text))

        previous_index = end_index
    
    return invoice_list

# Path to your PDF
pdf_path = 'data/sample_invoices/GST Sales July 24.pdf'

# Extract text from PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Extract invoice numbers and their corresponding text
invoices = extract_invoices(pdf_text)

# Create a DataFrame from the invoice list
df_invoices = pd.DataFrame(invoices, columns=['Invoice Number', 'Invoice Text'])

# Display the DataFrame
df_invoices.head()

# Optionally, save the DataFrame to a CSV file
# df_invoices.to_csv('invoices.csv', index=False)


Unnamed: 0,Invoice Number,Invoice Text
0,216,CI8139E1ZH\nState: 29-KarnatakaInvoice No.: 21...
1,Item,Place of Supply: 29-Karnataka\n#Item name HSN/...
2,217,71A1ZR\nState: 27-Mahar ashtr aInvoice No.: 21...
3,Item,e of Supply: 27-Mahar ashtr a\n#Item name HSN/...
4,218,57H1ZI\nState: 27-Mahar ashtr aInvoice No.: 21...


In [55]:
data1 = {
    "column 1": 1,
    "column 2": 2,
    "column 3": 3,
    "column 4": 4
}

data2 = {
    "column 1": 11,
    "column 2": 12,
    "column 3": 13,
    "column 4": 14
}

combined_data = [
    data1,  # First row of data from data1
    data2   # Second row of data from data2
]
df = pd.json_normalize(combined_data)
print(df)

   column 1  column 2  column 3  column 4
0         1         2         3         4
1        11        12        13        14


In [60]:
from PyPDF2 import PdfReader


def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    # Create a list to hold the text of each page
    text_list = []
    for page in reader.pages:
            text = page.extract_text()  # Extract text from the page
            if text:  # Check if there is any text extracted
                text_list.append(text)  # Add the text to the list
    return text_list

data= extract_text_from_pdf('data/sample_invoices/GST Sales July 24.pdf')

data

["9049337337\nxcellentmadha v1@gmail\n.com\nShop No 1, L unkad\nPlaza, Nr BSNL Oﬃce,\nViman Nagar , Pune\nXcellent Xerox And Online Services\nGSTIN: 27AJYP J9896Q1Z G\nState: 27-Mahar ashtr aTax Invoice\nBill To\nConvergint India Pvt Ltd\n2nd Floor , Prestige A trium, No.303 and 304\nNo.1, Centr al Str eet, Shiv ajinagar\nContact No.: 7415861269\nGSTIN Number: 29AABCI8139E1ZH\nState: 29-KarnatakaInvoice No.: 216\nDate: 02-07-2024\nPlace of Supply: 29-Karnataka\n#Item name HSN/ SAC Quantity Price/ unit GST Amount\n1A4 Black & White 48211090 929 ₹ 1.50 ₹ 250.83 (18.0%) ₹ 1,644.33\n2A4 Colour 48211090 67 ₹ 10.00 ₹ 120.60 (18.0%) ₹ 790.60\n3A1 Color 48211090 4 ₹ 150.00 ₹ 108.00 (18.0%) ₹ 708.00\n4Ring Box File 481960 2 ₹ 90.00 ₹ 32.40 (18.0%) ₹ 212.40\n5File Saperator 48201010 1 ₹ 100.00 ₹ 18.00 (18.0%) ₹ 118.00\n6Stamp 35069190 1 ₹ 250.00 ₹ 45.00 (18.0%) ₹ 295.00\nTotal 1004 ₹ 574.83 ₹ 3,768.33\nPay To:\nBank Name: Y es Bank, Amanor a Pune\nBank Account No.: 058563400001177\nBank IFSC cod

In [3]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

# Example invoice text
invoice_text = """

"""

def extract_total_amount(text):
    # Apply spaCy's NER to the text
    doc = nlp(text)
    
    total_amount = None
    # Iterate over the recognized entities
    for ent in doc.ents:
        # Check for 'MONEY' entities (which include monetary amounts)
        if ent.label_ == "MONEY":
            # Since there could be multiple monetary values, we look for keywords like "Total"
            if "total" in text.lower():
                total_amount = ent.text

    return total_amount

# Extract the total amount
total = extract_total_amount(invoice_text)
print(f"Total Amount: {total}")


Total Amount: None
