In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")




  from .autonotebook import tqdm as notebook_tqdm


Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer, device='cuda')


In [4]:
data = [
    {
        "original_email": "Date: Mon, 5 Feb 2001 03:26:00 -0800 (PST)\nFrom: errol.mclaughlin@enron.com\nTo: jeffrey.gossett@enron.com\nSubject: Re: G-Daily-Est book deals to be flipped updated list\nBody: \nLuchas has been working on them and will be finished within the hour.\nErrol",
        "subject": "Re: G-Daily-Est book deals to be flipped updated list",
        "timezone": "PST",
        "length": 14,
        "year": 2001,
        "month": "Feb",
        "recipients": 1,
        "cc_participants": 0,
        "is_reply": 1,
        "summary": "Luchas has been working on them and will be finished within the hour.",
        "response": "Date: Mon, 5 Feb 2001 03:26:00 -0800 (PST)\nFrom: jeffrey.gossett@enron.com\nTo: errol.mclaughlin@enron.com\nCC: \nSubject: Re: G-Daily-Est book deals to be flipped updated list\nBody: \nThanks for the update, Errol. Looking forward to seeing the finished list.\nJeffrey"
    },
    {
        "original_email": "Date: Fri, 12 Jan 2001 09:00:00 -0800 (PST)\nFrom: errol.mclaughlin@enron.com\nTo: gregory.carraway@enron.com\nSubject: Re: Gas Daily deals in dispute\nBody: \nGreg,\nSherry Dawson took care of these deals today.\nThanks,\nErrol McLaughlin, X5-8274",
        "subject": "Re: Gas Daily deals in dispute",
        "timezone": "PST",
        "length": 19,
        "year": 2001,
        "month": 1,
        "recipients": 1,
        "cc_participants": 0,
        "is_reply": 1,
        "summary": "This email is a reply regarding the Gas Daily deals in dispute. Sherry Dawson has taken care of these deals today. Thanks.",
        "response": "Date: Fri, 12 Jan 2001 09:00:00 -0800 (PST)\nFrom: gregory.carraway@enron.com\nTo: errol.mclaughlin@enron.com\nCC:\nSubject: Re: Gas Daily deals in dispute\nBody:\nErrol,\n\nThank you for letting me know that Sherry Dawson has taken care of the Gas Daily deals in dispute. I appreciate the update.\n\nBest,\nGreg"
    }
]

In [5]:
example = data[1]["response"]

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.97629625, 'index': 23, 'word': 'g', 'start': 50, 'end': 51}, {'entity': 'B-PER', 'score': 0.3854005, 'index': 24, 'word': '##re', 'start': 51, 'end': 53}, {'entity': 'B-PER', 'score': 0.5042706, 'index': 26, 'word': '##y', 'start': 56, 'end': 57}, {'entity': 'B-PER', 'score': 0.8863566, 'index': 28, 'word': 'car', 'start': 58, 'end': 61}, {'entity': 'I-PER', 'score': 0.6600193, 'index': 29, 'word': '##raw', 'start': 61, 'end': 64}, {'entity': 'I-PER', 'score': 0.631353, 'index': 30, 'word': '##ay', 'start': 64, 'end': 66}, {'entity': 'B-ORG', 'score': 0.48990163, 'index': 32, 'word': 'en', 'start': 67, 'end': 69}, {'entity': 'I-ORG', 'score': 0.87843573, 'index': 33, 'word': '##ron', 'start': 69, 'end': 72}, {'entity': 'B-PER', 'score': 0.919089, 'index': 38, 'word': 'er', 'start': 81, 'end': 83}, {'entity': 'B-PER', 'score': 0.95298254, 'index': 41, 'word': 'm', 'start': 87, 'end': 88}, {'entity': 'I-PER', 'score': 0.8076757, 'index': 42, 'word': '##c',

In [6]:
print(example)

Date: Fri, 12 Jan 2001 09:00:00 -0800 (PST)
From: gregory.carraway@enron.com
To: errol.mclaughlin@enron.com
CC:
Subject: Re: Gas Daily deals in dispute
Body:
Errol,

Thank you for letting me know that Sherry Dawson has taken care of the Gas Daily deals in dispute. I appreciate the update.

Best,
Greg


In [13]:
entities = ner_results.copy()

In [14]:
reconstructed_entities = []
current_entity = ""
current_type = None
start_index = None

for e in entities:
    if e['entity'].startswith("B-"):
        if current_entity:
            reconstructed_entities.append({
                "entity": current_type,
                "word": current_entity,
                "start": start_index,
                "end": end_index
            })
        current_entity = e['word']
        current_type = e['entity'][2:]
        start_index = e['start']
        end_index = e['end']
    else:
        current_entity += e['word'].replace("##", "")
        end_index = e['end']

if current_entity:
    reconstructed_entities.append({
        "entity": current_type,
        "word": current_entity,
        "start": start_index,
        "end": end_index
    })

In [15]:
reconstructed_entities

[{'entity': 'PER', 'word': 'g', 'start': 50, 'end': 51},
 {'entity': 'PER', 'word': '##re', 'start': 51, 'end': 53},
 {'entity': 'PER', 'word': '##y', 'start': 56, 'end': 57},
 {'entity': 'PER', 'word': 'carraway', 'start': 58, 'end': 66},
 {'entity': 'ORG', 'word': 'enron', 'start': 67, 'end': 72},
 {'entity': 'PER', 'word': 'er', 'start': 81, 'end': 83},
 {'entity': 'PER', 'word': 'mclaughlinron', 'start': 87, 'end': 103},
 {'entity': 'MISC', 'word': 'GasDaily', 'start': 125, 'end': 134},
 {'entity': 'PER', 'word': 'Erro', 'start': 158, 'end': 162},
 {'entity': 'PER', 'word': '##l', 'start': 162, 'end': 163},
 {'entity': 'PER', 'word': 'SherryDawson', 'start': 201, 'end': 214},
 {'entity': 'MISC', 'word': 'GasDaily', 'start': 237, 'end': 246},
 {'entity': 'PER', 'word': 'Best', 'start': 291, 'end': 295},
 {'entity': 'PER', 'word': 'Greg', 'start': 297, 'end': 301}]

In [16]:
original_sentence = example # the original sentence should be provided here

masked_sentence = original_sentence
offset = 0

for entity in reconstructed_entities:
    replacement = "<person>" if entity["entity"] == "PER" else "<org>"
    start = entity["start"] + offset
    end = entity["end"] + offset
    masked_sentence = masked_sentence[:start] + replacement + masked_sentence[end:]
    offset += len(replacement) - (end - start)

masked_sentence


'Date: Fri, 12 Jan 2001 09:00:00 -0800 (PST)\nFrom: <person><person>gor<person>.<person>@<org>.com\nTo: <person>rol.<person>.com\nCC:\nSubject: Re: <org> deals in dispute\nBody:\n<person><person>,\n\nThank you for letting me know that <person> has taken care of the <org> deals in dispute. I appreciate the update.\n\n<person>,\n<person>'

In [18]:
import re

In [19]:
masked_sentence = re.sub(r'(<person>)+', '<person>', masked_sentence)

# Replace multiple adjacent <org> with a single <org>
masked_sentence = re.sub(r'(<org>)+', '<org>', masked_sentence)

print(masked_sentence)

Date: Fri, 12 Jan 2001 09:00:00 -0800 (PST)
From: <person>gor<person>.<person>@<org>.com
To: <person>rol.<person>.com
CC:
Subject: Re: <org> deals in dispute
Body:
<person>,

Thank you for letting me know that <person> has taken care of the <org> deals in dispute. I appreciate the update.

<person>,
<person>
