In [1]:
import os

import csv
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [2]:
# Folder holding the rental-agreement .txt files (two levels up, then dataset/...)

data_dir = os.path.join(os.pardir, os.pardir, 'dataset', 'rental-agreement', 'txt')
print(data_dir)

../../dataset/rental-agreement/txt


In [3]:
# Read the content out of each file in data_dir, store as list of strings.
# os.listdir() gives the entries in arbitrary order, so the names are sorted: the
# position of a text in `agreements` is what later names its output csv, and it
# has to be the same on every run / machine.
# Entries that are not regular files (e.g. sub-directories) are skipped because
# they cannot be opened as text.

agreements = list()
for fname in sorted(os.listdir(data_dir)):
    fpath = os.path.join(data_dir, fname)
    if not os.path.isfile(fpath):
        continue
    with open(fpath, 'r', encoding='utf-8') as f:
        agreements.append(f.read())

In [4]:
# Fetch the BERT checkpoint fine-tuned for NER, together with its tokenizer

checkpoint = 'dslim/bert-large-NER'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)
tokenizer, model

Downloading:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

(PreTrainedTokenizer(name_or_path='dslim/bert-large-NER', vocab_size=28996, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}),
 BertForTokenClassification(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(28996, 1024, padding_idx=0)
       (position_embeddings): Embedding(512, 1024)
       (token_type_embeddings): Embedding(2, 1024)
       (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0): BertLayer(
           (attention): BertAttention(
             (self): BertSelfAttention(
               (query): Linear(in_features=1024, out_features=1024, bias=True)
               (key): Linear(in_features=1024, out_features=1024, bias=True)
               (value): Linear(in_featu

In [5]:
# Wrap the BERT model and its tokenizer in the transformers NER pipeline

nlp = pipeline(task='ner', model=model, tokenizer=tokenizer)
nlp

<transformers.pipelines.TokenClassificationPipeline at 0x7f9aece184f0>

In [6]:
# Quick demonstration of the pipeline output on a single sentence
sample_sentence = 'Donald Trump is the 45th president of the United States.'
nlp(sample_sentence)

[{'word': 'Donald',
  'score': 0.9986123442649841,
  'entity': 'B-PER',
  'index': 1},
 {'word': 'Trump', 'score': 0.9941266775131226, 'entity': 'I-PER', 'index': 2},
 {'word': 'United',
  'score': 0.9993253946304321,
  'entity': 'B-LOC',
  'index': 9},
 {'word': 'States',
  'score': 0.9987082481384277,
  'entity': 'I-LOC',
  'index': 10}]

In [7]:
# Function that takes NER results as a list of dictionaries, extracts entity list for the doc
# For each entity, creates the tuple: (text of the entity, start index, end index,
# label of the entity, score of the NER algo)
# Returns the list of such tuples

def collect_entities(ner_results):
    """Group token-level NER predictions into whole entities.

    Parameters
    ----------
    ner_results : list of dict
        Token-level predictions. Every dict must provide the keys 'word',
        'entity' (a tag such as 'B-PER' / 'I-LOC'), 'index' and 'score'.
        The keys 'start' / 'end' (character offsets, end exclusive) are
        optional and may be None; they are used when present.

    Returns
    -------
    list of tuple
        One (text, start, end, label, score) tuple per entity, in input order.

        * text  - the merged token texts; word pieces starting with '##' are
          glued on without a space, other continuation tokens with a space.
        * start, end - the character offsets of the entity (end exclusive)
          when the first token carries 'start' and the last merged token
          carries 'end'. Otherwise the legacy values are kept for backward
          compatibility: start is the 'index' of the first token and end is
          start + len(text) - 1.
          NOTE(review): in the sample output 'index' looks like a token
          position rather than a character offset, so the legacy values are
          not a real character span - confirm before relying on them.
        * label - tag of the first token without its 'B-' / 'I-' prefix.
        * score - score of the FIRST token only (not aggregated).
    """
    entities = list()
    n_res = len(ner_results)
    i = 0

    while i < n_res:
        first = ner_results[i]
        last = first
        entity_text = first['word']
        curr_tag = first['entity']

        # Absorb the following tokens for as long as they continue this entity
        while i < n_res - 1:
            candidate = ner_results[i + 1]

            # A continuation has to directly follow the previous token. Without
            # this check, two separate mentions that merely share a label (or a
            # stray '##' piece further down the text) would be fused together.
            if candidate['index'] != last['index'] + 1:
                break

            next_word = candidate['word']
            next_tag = candidate['entity']

            # Whenever the next token starts with ##, just append it to the current token,
            # without taking into consideration its individual label.
            if next_word[:2] == '##':
                entity_text += next_word[2:]

            # Whenever the next token starts with I and has the last label part same as the
            # current token, append it to the current token with a space.
            elif next_tag[0] == 'I' and curr_tag[2:] == next_tag[2:]:
                entity_text += ' ' + next_word

            else:
                break

            last = candidate
            i += 1

        # Prefer real character offsets when the pipeline supplies them
        start = first.get('start')
        end = last.get('end')
        if start is None or end is None:
            # Legacy fallback, identical to the previous behaviour
            start = first['index']
            end = start + len(entity_text) - 1

        entities.append((entity_text, start, end, curr_tag[2:], first['score']))
        i += 1

    return entities

In [8]:
# Function that writes the list of entity info tuples into the csv specified

def write_to_csv(entity_list, header_list, path):
    """Write a header row followed by one row per entity to a csv file.

    Parameters
    ----------
    entity_list : iterable of sequences
        The rows to write, one sequence of field values per entity.
    header_list : sequence
        Column names, written as the first row.
    path : str
        Destination file. It is created, or overwritten if it already exists;
        its parent directory must already exist.

    Returns
    -------
    None

    Notes
    -----
    Fields are comma separated and quoted with a single quote (') only when
    needed (csv.QUOTE_MINIMAL), so readers must use the same quotechar.
    """
    # The file is written as UTF-8 explicitly instead of relying on the platform
    # default, so non-ASCII entity text is encoded the same way everywhere.
    # newline='' lets the csv writer control the line endings itself.
    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
        csvwriter = csv.writer(csv_file, delimiter=',', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(header_list)
        csvwriter.writerows(entity_list)

    return

In [9]:
# For each file text, run through the NER pipeline and 
# write the result in a separate csv (named after the position of the text in `agreements`)

header_list = ['entity', 'start_index', 'end_index', 'label', 'score']

# Make sure the output folder exists, otherwise opening the first csv for writing fails
output_dir = os.path.join(data_dir, '..', 'huggingface-bert_large-ner')
os.makedirs(output_dir, exist_ok=True)

# tqdm wraps the list itself rather than the enumerate object: enumerate has no
# length, so wrapping it hides the total and the progress bar / ETA
for i, agr in enumerate(tqdm(agreements)):
    ner_results = nlp(agr)
    ent_texts = collect_entities(ner_results)
    
    filepath = os.path.join(output_dir, str(i)+'.csv')
    write_to_csv(ent_texts, header_list, filepath)

43it [01:47,  2.51s/it]
