In [1]:
from transformers import RobertaTokenizer, RobertaForTokenClassification, pipeline

# Load the pre-trained RoBERTa tokenizer and model for token classification
tokenizer = RobertaTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
model = RobertaForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

# Alternatively, you can use the Hugging Face pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple",device="mps")


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Example text
text = "The invoice number is INV12345 and the total amount is $1500. Dubai pay on 10th May 2022 pay to account number 1234567890."

# Use the pipeline to extract entities
entities = ner_pipeline(text)

# Print out the entities
for entity in entities:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Confidence: {entity['score']:.2f}")


Entity:  Dubai, Label: LOC, Confidence: 1.00


In [12]:
import re
from typing import Dict, List

def extract_entities(text: str) -> Dict[str, str]:
    # Define regex patterns for extraction
    patterns = {
        'Invoice Number': r'(?:Invoice Number|Invoice ID|Invoice number):?\s*([A-Z0-9-]+)',
        'Invoice Amount': r'Rs\s*(\d+)(?:\s*(?:for|invoice|amount))?',
        'Tax Amount': r'tax\s*of?\s*Rs\s*(\d+)',
        'Total Amount': r'total amount\s*of?\s*Rs\s*(\d+)|total payment due is Rs\s*(\d+)|Pay Rs\s*(\d+)',
        'Date': r'dated\s*(\d{1,2}\s\w+\s\d{4})|The date is (\d{1,2}\s\w+\s\d{4})'
    }
    
    entities = {}
    
    for entity, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            # Extract the first group from the match
            entities[entity] = match.group(1) if match.group(1) else match.group(2)
    
    return entities

# Example texts
texts = [
    "Please pay the total amount of Rs 120, which includes the invoice amount of Rs 100 and tax of Rs 10, for Invoice Number INV-12345 dated 20 July 2024, to Siva & Co. The payment should be made to account number 12345678 at ABC Bank, Chennai.",
    "The total payment due is Rs 150. This includes Rs 120 for the invoice and Rs 30 for tax. Invoice ID is INV-67890 dated 15 August 2024. Account 87654321 at XYZ Bank, Mumbai.",
    "Pay Rs 200 total. Invoice was Rs 150 and tax Rs 50. Invoice number: INV-99999. The date is 01 September 2024. Account 98765432, Bank: DEF Bank, Bangalore."
]

# Extract entities from each text
for i, text in enumerate(texts):
    print(f"Text {i + 1}:")
    entities = extract_entities(text)
    for key, value in entities.items():
        print(f"  {key}: {value}")
    print()


Text 1:
  Invoice Number: INV-12345
  Invoice Amount: 120
  Tax Amount: 10
  Total Amount: 120
  Date: 20 July 2024

Text 2:
  Invoice Number: is
  Invoice Amount: 150
  Total Amount: 150
  Date: 15 August 2024

Text 3:
  Invoice Number: INV-99999
  Invoice Amount: 200
  Total Amount: None
  Date: 01 September 2024



In [36]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_embeddings(sentence, model, tokenizer):
    # Tokenize and encode the sentence
    inputs = tokenizer(sentence, return_tensors='pt')
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the embeddings from the last hidden layer
    layer_embeddings = outputs.last_hidden_state
    # Average pooling to get sentence embedding
    sentence_embedding = torch.mean(layer_embeddings, dim=1).squeeze().numpy()
    
    # Token embeddings
    token_embeddings = layer_embeddings.squeeze().numpy()
    
    # Extract tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze().tolist())
    
    return tokens, token_embeddings, sentence_embedding

def get_phrase_embedding(phrase, model, tokenizer):
    # Tokenize and encode the phrase
    inputs = tokenizer(phrase, return_tensors='pt')
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the embeddings from the last hidden layer
    layer_embeddings = outputs.last_hidden_state
    # Average pooling to get phrase embedding
    phrase_embedding = torch.mean(layer_embeddings, dim=1).squeeze().numpy()
    
    return phrase_embedding

def find_best_match(target_phrase, sentence, model, tokenizer):
    # Compute embeddings for the entire sentence
    tokens, token_embeddings, sentence_embedding = get_embeddings(sentence, model, tokenizer)
    
    # Compute embedding for the target phrase
    target_embedding = get_phrase_embedding(target_phrase, model, tokenizer)
    
    # Calculate the similarity between the target phrase embedding and all token embeddings
    highest_similarity = -1
    best_match = None
    
    for i, token in enumerate(tokens):
        if token not in tokenizer.all_special_tokens and token.strip():
            token_embedding = token_embeddings[i].reshape(1, -1)  # Reshape for cosine_similarity
            target_embedding_reshaped = target_embedding.reshape(1, -1)  # Reshape for cosine_similarity
            
            # Calculate similarity between target phrase and each token
            token_similarity = calculate_similarity(target_embedding_reshaped, token_embedding)
            
            if token_similarity > highest_similarity:
                highest_similarity = token_similarity
                best_match = token
    
    return best_match, highest_similarity

def calculate_similarity(embedding1, embedding2):
    # Ensure that the embeddings are not NaN
    if np.isnan(embedding1).any() or np.isnan(embedding2).any():
        print("Warning: One of the embeddings contains NaN values. Similarity will be set to 0.")
        return 0.0
    
    return cosine_similarity(embedding1, embedding2)[0][0]

# Load the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

# Define the sentence
sentence = "Please pay the total amount of Rs 120, which includes the invoice amount of Rs 100 and tax of Rs 10."

# Example target phrases
targets = ["total amount", "invoice amount", "tax"]

for target in targets:
    match, similarity = find_best_match(target, sentence, model, tokenizer)
    print(f"Best match for '{target}': '{match}' with similarity {similarity:.4f}")


Best match for 'total amount': 'please' with similarity 0.4684
Best match for 'invoice amount': '##ice' with similarity 0.6334
Best match for 'tax': '.' with similarity 0.5107
