In [40]:
import os

def classify_file(file_path):
    _, ext = os.path.splitext(file_path)
    if ext.lower() in ['.eml', '.msg']:
        return ext.lower()
    else:
        raise ValueError("Unsupported file type")

# Example usage
file_path = 'INV.eml'
file_type = classify_file(file_path)
print(f"File Type: {file_type}")

File Type: .eml


In [43]:
import email
from email import policy
from email.parser import BytesParser
import extract_msg

def parse_email(file_path, file_type):
    if file_type == '.eml':
        with open(file_path, 'rb') as f:
            msg = BytesParser(policy=policy.default).parse(f)
        metadata = {
            'from': msg['from'],
            'to': msg['to'],
            'subject': msg['subject'],
            'date': msg['date'],
            'content_type': msg.get_content_type()
        }
        return msg, metadata

    elif file_type == '.msg':
        msg = extract_msg.Message(file_path)
        metadata = {
            'from': msg.sender,
            'to': msg.to,
            'subject': msg.subject,
            'date': msg.date,
            'content_type': msg.message_type
        }
        return msg, metadata

# Example usage
msg, metadata = parse_email('INV.eml', '.eml')
metadata


{'from': 'SIVARAMAKRISHNAN SIVALINGAM <sivajaya2009@gmail.com>',
 'to': 'Sivaramakrishnan Sivalingam <sivajaya2009@gmail.com>',
 'subject': 'INV',
 'date': 'Sat, 03 Aug 2024 16:13:47 +0530',
 'content_type': 'multipart/alternative'}

In [44]:
def get_body_content(msg):
    if msg.is_multipart():
        for part in msg.iter_parts():
            if part.get_content_type() == 'text/plain':
                return part.get_payload(decode=True).decode()
    else:
        return msg.get_payload(decode=True).decode()

body_content = get_body_content(msg)
body_content

'Your Company Inc                        Bill To:\nCompany No : U-1423242                  Target Company Inc\n\nUnit 1, Lingkaran Syed Putra            Unit 999, Lingkaran Syed Putra\n\nMid Valley City                         Mid Valley City\n\n59200 Kuala Lumpur                      59200 Kuala Lumpur\n\nWilayah Persekutuan Kuala Lumpur        Wilayah Persekutuan Kuala Lumpur\nTel: 03-9876 5432                       Tel: 03-1234 567\n\nINVOICE\n\nInvoice No.: INV-42532622                   Invoice Date: 2021-05-23\n\nNo Description Quantity Unit Price, $ Price, $\n1 | Lorems ipsum 5 2.30 11.50\n2 | Consectetur adipiscing el 2 5.63 11.26\n3 | Quis autem vel eum i 3 3.63 10.89\n\nSubtotal 33.65     Tax Amount 1.68    Grand total 35.33\n\nNote: The tax invoice is computer generated and no signature is required.'

In [45]:
from fpdf import FPDF
from PIL import Image, ImageDraw, ImageFont

def email_body_to_image(email_body, image_path):
    font = ImageFont.load_default()
    lines = email_body.split('\n')
    width = max(font.getsize(line)[0] for line in lines) + 20
    height = len(lines) * (font.getsize(lines[0])[1] + 5) + 20
    
    image = Image.new('RGB', (width, height), color='white')
    draw = ImageDraw.Draw(image)
    
    y_text = 10
    for line in lines:
        draw.text((10, y_text), line, font=font, fill='black')
        y_text += font.getsize(line)[1] + 5
    
    image.save(image_path)

def email_body_to_pdf(email_body, pdf_path):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in email_body.split('\n'):
        pdf.cell(200, 10, txt=line, ln=True)
    pdf.output(pdf_path)

# Example usage
#email_body_to_image(body_content, 'email_body.png')
email_body_to_pdf(body_content, 'email_body.pdf')


In [4]:
import re

def preprocess_text(text):
    # Remove unnecessary characters and normalize text
    text = text.replace('\n', ' ').replace('\r', ' ')
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text

preprocessed_text = preprocess_text(body_content)
print(preprocessed_text)


Invoice No.: INV-42532622 Invoice Date: 2021-05-23 No Description Quantity Unit Price, $ Price, $ 1 | Lorems ipsum 5 2.30 11.50 2 | Consectetur adipiscing el 2 5.63 11.26 3 | Quis autem vel eum i 3 3.63 10.89 Subtotal 33.65 Tax Amount 1.68 Grand total 35.33 Note: The tax invoice is computer generated and no signature is required. 


In [29]:
import tensorflow as tf
from transformers import TFAutoModelForTokenClassification, AutoTokenizer
import pdfplumber
import docx
import pandas as pd

model = TFAutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

def extract_entities(text):
    inputs = tokenizer(text, return_tensors="tf")
    outputs = model(inputs.data)
    logits = outputs.logits
    predicted_ids = tf.argmax(logits, axis=-1).numpy()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].numpy()[0])
    entities = [(token, idx) for token, idx in zip(tokens, predicted_ids[0])]
    return entities

# Example usage for email body
email_body = "Invoice No. 1234"
entities = extract_entities(email_body)
print(entities)


All PyTorch model weights were used when initializing TFBertForTokenClassification.

All the weights of TFBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


[('[CLS]', 0), ('In', 0), ('##vo', 0), ('##ice', 0), ('No', 0), ('.', 0), ('123', 0), ('##4', 0), ('[SEP]', 0)]


In [25]:
entities

[('[CLS]', 0),
 ('In', 0),
 ('##vo', 0),
 ('##ice', 0),
 ('No', 0),
 ('.', 0),
 (':', 0),
 ('IN', 0),
 ('##V', 0),
 ('-', 0),
 ('425', 0),
 ('##32', 0),
 ('##6', 0),
 ('##22', 0),
 ('In', 0),
 ('##vo', 0),
 ('##ice', 0),
 ('Date', 0),
 (':', 0),
 ('202', 0),
 ('##1', 0),
 ('-', 0),
 ('05', 0),
 ('-', 0),
 ('23', 0),
 ('No', 0),
 ('Des', 0),
 ('##cription', 0),
 ('Q', 0),
 ('##uant', 0),
 ('##ity', 0),
 ('Unit', 0),
 ('Price', 0),
 (',', 0),
 ('$', 0),
 ('Price', 0),
 (',', 0),
 ('$', 0),
 ('1', 0),
 ('|', 0),
 ('Lo', 0),
 ('##rem', 0),
 ('##s', 0),
 ('i', 0),
 ('##ps', 0),
 ('##um', 0),
 ('5', 0),
 ('2', 0),
 ('.', 0),
 ('30', 0),
 ('11', 0),
 ('.', 0),
 ('50', 0),
 ('2', 0),
 ('|', 0),
 ('Con', 0),
 ('##sect', 0),
 ('##et', 0),
 ('##ur', 0),
 ('ad', 0),
 ('##ip', 0),
 ('##is', 0),
 ('##cing', 0),
 ('el', 0),
 ('2', 0),
 ('5', 0),
 ('.', 0),
 ('63', 0),
 ('11', 0),
 ('.', 0),
 ('26', 0),
 ('3', 0),
 ('|', 0),
 ('Q', 0),
 ('##ui', 0),
 ('##s', 0),
 ('au', 0),
 ('##tem', 0),
 ('ve', 0),


In [11]:
import json

def generate_json_output(metadata, email_entities):
    output = {
        'from': metadata['from'],
        'to': metadata['to'],
        'subject': metadata['subject'],
        'Invoice No.': email_entities.get('INVOICE'),
        'Invoice Date': email_entities.get('DATE'),
        'Amount': email_entities.get('MONEY'),
    }
    return json.dumps(output, indent=4)

# Example usage
json_output = generate_json_output(metadata, email_entities)
print(json_output)


{
    "from": "SIVARAMAKRISHNAN SIVALINGAM <sivajaya2009@gmail.com>",
    "to": "Sivaramakrishnan Sivalingam <sivajaya2009@gmail.com>",
    "subject": "Invoice for Payment",
    "Invoice No.": null,
    "Invoice Date": "3 3.63 10.89",
    "Amount": null
}


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load a pre-trained model and tokenizer for NER
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create a pipeline for NER
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Define the text string
text = "John Doe works at OpenAI and lives in San Francisco."

# Perform NER
entities = nlp(text)

# Process and print the entities
for entity in entities:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}, Confidence: {entity['score']:.2f}")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Entity: John, Label: I-PER, Confidence: 1.00
Entity: Do, Label: I-PER, Confidence: 1.00
Entity: ##e, Label: I-PER, Confidence: 0.99
Entity: Open, Label: I-ORG, Confidence: 1.00
Entity: ##A, Label: I-ORG, Confidence: 1.00
Entity: ##I, Label: I-ORG, Confidence: 1.00
Entity: San, Label: I-LOC, Confidence: 1.00
Entity: Francisco, Label: I-LOC, Confidence: 1.00


In [16]:
text = """
Your Company Inc                        Bill To:
Company No : U-1423242                  Target Company Inc

Unit 1, Lingkaran Syed Putra            Unit 999, Lingkaran Syed Putra

Mid Valley City                         Mid Valley City

59200 Kuala Lumpur                      59200 Kuala Lumpur

Wilayah Persekutuan Kuala Lumpur        Wilayah Persekutuan Kuala Lumpur
Tel: 03-9876 5432                       Tel: 03-1234 567

INVOICE

Invoice No.: INV-42532622                   Invoice Date: 2021-05-23

No Description Quantity Unit Price, $ Price, $
1 | Lorems ipsum 5 2.30 11.50
2 | Consectetur adipiscing el 2 5.63 11.26
3 | Quis autem vel eum i 3 3.63 10.89

Subtotal 33.65     Tax Amount 1.68    Grand total 35.33

Note: The tax invoice is computer generated and no signature is required.
"""


In [25]:
body_content

'Invoice No.: INV-42532622                   Invoice Date: 2021-05-23\n\nNo Description Quantity Unit Price, $ Price, $\n1 | Lorems ipsum 5 2.30 11.50\n2 | Consectetur adipiscing el 2 5.63 11.26\n3 | Quis autem vel eum i 3 3.63 10.89\n\nSubtotal 33.65     Tax Amount 1.68    Grand total 35.33\n\nNote: The tax invoice is computer generated and no signature is required.\n'

In [26]:
body_content.replace('  ',' ').replace('\t',' ').replace('\n',' ')

'Invoice No.: INV-42532622          Invoice Date: 2021-05-23  No Description Quantity Unit Price, $ Price, $ 1 | Lorems ipsum 5 2.30 11.50 2 | Consectetur adipiscing el 2 5.63 11.26 3 | Quis autem vel eum i 3 3.63 10.89  Subtotal 33.65   Tax Amount 1.68  Grand total 35.33  Note: The tax invoice is computer generated and no signature is required. '

In [36]:
new_text = ' '.join(re.sub(r'\s+', ' ', text).strip().split())

In [37]:
new_text

'Your Company Inc Bill To: Company No : U-1423242 Target Company Inc Unit 1, Lingkaran Syed Putra Unit 999, Lingkaran Syed Putra Mid Valley City Mid Valley City 59200 Kuala Lumpur 59200 Kuala Lumpur Wilayah Persekutuan Kuala Lumpur Wilayah Persekutuan Kuala Lumpur Tel: 03-9876 5432 Tel: 03-1234 567 INVOICE Invoice No.: INV-42532622 Invoice Date: 2021-05-23 No Description Quantity Unit Price, $ Price, $ 1 | Lorems ipsum 5 2.30 11.50 2 | Consectetur adipiscing el 2 5.63 11.26 3 | Quis autem vel eum i 3 3.63 10.89 Subtotal 33.65 Tax Amount 1.68 Grand total 35.33 Note: The tax invoice is computer generated and no signature is required.'

In [8]:
import re
import json

def extract_information(text):
    data = {}

    # Extract company number
    company_no_match = re.search(r'Company No\s*:\s*(U-\d+)', text)
    if company_no_match:
        data['Company No'] = company_no_match.group(1)

    # Extract invoice number and date
    invoice_info_match = re.search(r'Invoice No\.\s*:\s*(INV-\d+)\s+Invoice Date:\s*([\d-]+)', text)
    if invoice_info_match:
        data['Invoice No'] = invoice_info_match.group(1)
        data['Invoice Date'] = invoice_info_match.group(2)

    # Extract seller address (text below "Your Company Inc" up to "Bill To:")
    seller_address_match = re.search(r'Your Company Inc\s+(.*?)\s+Bill To:', text, re.DOTALL)
    if seller_address_match:
        data['Seller Address'] = seller_address_match.group(1).strip()

    # Extract buyer address (text below "Bill To:" up to "INVOICE")
    buyer_address_match = re.search(r'Bill To:\s+(.*?)\s+INVOICE', text, re.DOTALL)
    if buyer_address_match:
        data['Buyer Address'] = buyer_address_match.group(1).strip()

    # Extract Subtotal
    subtotal_match = re.search(r'Subtotal\s+(\d+\.\d{2})', text)
    if subtotal_match:
        data['Subtotal'] = float(subtotal_match.group(1))

    # Extract Tax Amount
    tax_amount_match = re.search(r'Tax Amount\s+(\d+\.\d{2})', text)
    if tax_amount_match:
        data['Tax Amount'] = float(tax_amount_match.group(1))

    # Extract Grand Total
    grand_total_match = re.search(r'Grand total\s+(\d+\.\d{2})', text)
    if grand_total_match:
        data['Grand Total'] = float(grand_total_match.group(1))

    return data


# Extract information
extracted_data = extract_information(text)

# Convert to JSON
json_data = json.dumps(extracted_data, indent=4)

print(json_data)


{
    "Company No": "U-1423242",
    "Invoice No": "INV-42532622",
    "Invoice Date": "2021-05-23",
    "Seller Address": "",
    "Buyer Address": "Company No : U-1423242                  Target Company Inc  Unit 1, Lingkaran Syed Putra            Unit 999, Lingkaran Syed Putra  Mid Valley City                         Mid Valley City  59200 Kuala Lumpur                      59200 Kuala Lumpur  Wilayah Persekutuan Kuala Lumpur        Wilayah Persekutuan Kuala Lumpur Tel: 03-9876 5432                       Tel: 03-1234 567",
    "Subtotal": 33.65,
    "Tax Amount": 1.68,
    "Grand Total": 35.33
}


In [1]:
import re

def extract_table_with_headers(text):
    # Define a regular expression pattern to match the header and rows
    header_pattern = re.compile(
        r'No\s*Description\s*Quantity\s*Unit Price, \$\s*Price, \$'
    )
    
    # Define a regular expression pattern to match the table rows
    row_pattern = re.compile(
        r'(\d+)\s*\|\s*(.+?)\s*(\d+)\s*([\d.]+)\s*([\d.]+)'
    )
    
    # Search for the header
    header_match = header_pattern.search(text)
    
    # Extract the column names from the header
    column_names = []
    if header_match:
        column_names = [name.strip() for name in header_match.group(0).split()]
    
    # Find all matches for table rows
    row_matches = row_pattern.findall(text)
    
    # Define a list to store the extracted table data
    table_data = []
    
    # Process each match and append to the table data list
    for match in row_matches:
        row = {
            column_names[0]: match[0],
            column_names[1]: match[1],
            column_names[2]: match[2],
            column_names[3]: match[3],
            column_names[4]: match[4]
        }
        table_data.append(row)
    
    return column_names, table_data

# The provided text
text = """
Your Company Inc Bill To: Company No : U-1423242 Target Company Inc Unit 1, Lingkaran Syed Putra Unit 999, Lingkaran Syed Putra Mid Valley City Mid Valley City 59200 Kuala Lumpur 59200 Kuala Lumpur Wilayah Persekutuan Kuala Lumpur Wilayah Persekutuan Kuala Lumpur Tel: 03-9876 5432 Tel: 03-1234 567 INVOICE Invoice No.: INV-42532622 Invoice Date: 2021-05-23 No Description Quantity Unit Price, $ Price, $ 1 | Lorems ipsum 5 2.30 11.50 2 | Consectetur adipiscing el 2 5.63 11.26 3 | Quis autem vel eum i 3 3.63 10.89 Subtotal 33.65 Tax Amount 1.68 Grand total 35.33 Note: The tax invoice is computer generated and no signature is required.
"""

# Extract column names and table data
column_names, table = extract_table_with_headers(text)
print("Column Names:", column_names)
for row in table:
    print(row)


Column Names: ['No', 'Description', 'Quantity', 'Unit', 'Price,', '$', 'Price,', '$']
{'No': '1', 'Description': 'Lorems ipsum', 'Quantity': '5', 'Unit': '2.30', 'Price,': '11.50'}
{'No': '2', 'Description': 'Consectetur adipiscing el', 'Quantity': '2', 'Unit': '5.63', 'Price,': '11.26'}
{'No': '3', 'Description': 'Quis autem vel eum i', 'Quantity': '3', 'Unit': '3.63', 'Price,': '10.89'}


In [27]:
import re

def format_invoice(text):
    # Split the text into lines
    lines = text.split(' ')
    
    # Define the blocks to identify
    blocks = {
        "company_info": ["Your Company Inc", "Bill To:", "Company No :", "Target Company Inc", "Unit 1,", "Unit 999,", "Mid Valley City", "59200 Kuala Lumpur", "Wilayah Persekutuan Kuala Lumpur", "Tel:"],
        "invoice_info": ["INVOICE", "Invoice No.:", "Invoice Date:"],
        "item_info": ["No", "Description", "Quantity", "Unit Price,", "Price,"],
        "totals": ["Subtotal", "Tax Amount", "Grand total"],
        "note": ["Note:"]
    }
    
    formatted_text = ""
    current_block = None
    
    for line in lines:
        for block, keywords in blocks.items():
            if any(keyword in line for keyword in keywords):
                if current_block != block:
                    formatted_text += "\n" if formatted_text else ""
                    current_block = block
                break
        formatted_text += line + " "
    
    return formatted_text.strip()

text = """Your Company Inc Bill To: Company No : U-1423242 Target Company Inc Unit 1, Lingkaran Syed Putra Unit 999, Lingkaran Syed Putra Mid Valley City Mid Valley City 59200 Kuala Lumpur 59200 Kuala Lumpur Wilayah Persekutuan Kuala Lumpur Wilayah Persekutuan Kuala Lumpur Tel: 03-9876 5432 Tel: 03-1234 567 INVOICE Invoice No.: INV-42532622 Invoice Date: 2021-05-23 No Description Quantity Unit Price, $ Price, $ 1 | Lorems ipsum 5 2.30 11.50 2 | Consectetur adipiscing el 2 5.63 11.26 3 | Quis autem vel eum i 3 3.63 10.89 Subtotal 33.65 Tax Amount 1.68 Grand total 35.33 Note: The tax invoice is computer generated and no signature is required."""

formatted_text = format_invoice(text)
print(formatted_text)


Your Company Inc Bill To: Company 
No : U-1423242 Target Company Inc Unit 1, Lingkaran Syed Putra Unit 999, Lingkaran Syed Putra Mid Valley City Mid Valley City 59200 Kuala Lumpur 59200 Kuala Lumpur Wilayah Persekutuan Kuala Lumpur Wilayah Persekutuan Kuala Lumpur 
Tel: 03-9876 5432 Tel: 03-1234 567 
INVOICE Invoice 
No.: INV-42532622 Invoice Date: 2021-05-23 No Description Quantity Unit Price, $ Price, $ 1 | Lorems ipsum 5 2.30 11.50 2 | Consectetur adipiscing el 2 5.63 11.26 3 | Quis autem vel eum i 3 3.63 10.89 
Subtotal 33.65 Tax Amount 1.68 Grand total 35.33 
Note: The tax invoice is computer generated and no signature is required.


In [23]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sivaramakrishnan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True