In [14]:
from transformers import AutoTokenizer
from transformers import DistilBertForTokenClassification
import torch
from labels import id2label

In [15]:
# Identifiers handed to `from_pretrained`, named once so they are easy to swap.
# NOTE(review): these look like Hugging Face Hub repo ids — confirm they are not local paths.
TOKENIZER_NAME = "seelennebel/AM_tokenizer"
MODEL_NAME = "seelennebel/AM"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
model = DistilBertForTokenClassification.from_pretrained(MODEL_NAME)

In [16]:
def output_AM(text):
    """Print `text` with every token the model tags as an entity replaced by its tag.

    The text is encoded with the module-level `tokenizer` (truncated to at most
    512 tokens), classified with the module-level `model`, and re-assembled with
    `tokenizer.convert_tokens_to_string`:

    * a token predicted as label id 0 is kept as its decoded text;
    * a token predicted as an ``I-*`` tag is dropped, so an entity spanning
      several tokens shows up as a single placeholder;
    * a token predicted as any other tag is replaced by that tag's name from
      `id2label` (e.g. ``B-CITY``);
    * the tokenizer's CLS and SEP tokens are always dropped.

    Args:
        text: The string to mask. Only the first sequence of the encoded
            batch is processed.

    Returns:
        None. The masked text is written to stdout.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits

    # softmax is monotonic, so argmax over the raw logits selects the same class.
    label_ids = logits[0].argmax(dim=-1).tolist()
    token_ids = inputs["input_ids"][0].tolist()

    # Filter the sequence-boundary tokens by id *before* looking at the predicted
    # label; otherwise a [CLS]/[SEP] that the model happens to tag as an entity
    # would leak a spurious placeholder into the output.
    boundary_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id}

    pieces = []
    for token_id, label_id in zip(token_ids, label_ids):
        if token_id in boundary_ids:
            continue
        if label_id == 0:
            # Label id 0 means "not an entity": keep the original word piece.
            pieces.append(tokenizer.decode(token_id))
            continue
        label = id2label[label_id]
        if label.startswith("I-"):
            # Continuation of an entity whose first token already produced a placeholder.
            continue
        pieces.append(label)

    print(tokenizer.convert_tokens_to_string(pieces))

In [17]:
# Show the id -> tag mapping imported from `labels` that output_AM uses to name predictions.
print(id2label)

{0: 'O', 1: 'B-CITY', 2: 'I-CITY', 3: 'B-SOCIALNUM', 4: 'I-SOCIALNUM', 5: 'B-CREDITCARDNUMBER', 6: 'I-CREDITCARDNUMBER', 7: 'B-EMAIL', 8: 'I-EMAIL', 9: 'B-IDCARDNUM', 10: 'I-IDCARDNUM', 11: 'B-GIVENNAME', 12: 'I-GIVENNAME', 13: 'B-BUILDINGNUM', 14: 'B-PASSWORD', 15: 'I-PASSWORD', 16: 'B-USERNAME', 17: 'I-USERNAME', 18: 'B-TAXNUM', 19: 'I-TAXNUM', 20: 'B-SURNAME', 21: 'I-SURNAME', 22: 'B-STREET', 23: 'I-STREET', 24: 'B-ZIPCODE', 25: 'I-ZIPCODE', 26: 'B-DATEOFBIRTH', 27: 'I-DATEOFBIRTH', 28: 'B-TELEPHONENUM', 29: 'I-TELEPHONENUM', 30: 'B-ACCOUNTNUM', 31: 'I-ACCOUNTNUM', 32: 'B-DRIVERLICENSENUM', 33: 'I-DRIVERLICENSENUM', 34: 'I-BUILDINGNUM'}


### Demonstration

In [18]:
# Demo: a message containing a name, an ID and a postal address; output_AM prints the tagged version.
text = """
Hello,
My name is Andrii Amitan. My student ID: GH1024543.
I am from Ukraine. I am 20 years old.
My address is Reiherweg, 4a,
Potsdam 14469.
"""
output_AM(text)

Hello, My name is B-GIVENNAME B-SURNAME. My student ID : B-IDCARDNUM. I am from B-CITY. I am 20 years old. My address is B-CITY, B-BUILDINGNUM, B-CITY.


In [19]:
# Demo: a message containing a name and a credit card number; output_AM prints the tagged version.
text = """
Hello,

My name is Andrii Amitan. I would like to know if you received a payment from this credit card number:
6491736501947563
"""
output_AM(text)

Hello, My name is B-GIVENNAME B-SURNAME. I would like to know if you received a payment from this credit card number : B-CREDITCARDNUMBER


In [20]:
# Demo: a message containing an e-mail address; output_AM prints the tagged version.
text = """
Hello,

I would like to know where to send my documents.
Is your email: SECRETemail@gmail.com?
"""
output_AM(text)

Hello, I would like to know where to send my documents. Is your email : B-EMAIL?
