In [6]:
import csv
import os
!python -m pip install transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# import spacy
# from spacy import displacy
# !python -m spacy download en_core_web_lg

/content


In [7]:
# Relative location of the folder holding the agreement .txt files

path_parts = ('dataset', 'rental-agreement', 'txt')
data_dir = os.path.join(*path_parts)
print(data_dir)

dataset/rental-agreement/txt


In [8]:
# Read the content out of each file, store as list of strings.
# The names are sorted because os.listdir returns entries in arbitrary order;
# sorting makes the position of each text in `agreements` reproducible across runs.

agreements = list()
for fname in sorted(os.listdir(data_dir)):
    fpath = os.path.join(data_dir, fname)
    if not os.path.isfile(fpath):
        # Skip anything that is not a regular file (e.g. a sub-directory); open() would fail on it
        continue
    with open(fpath, 'r', encoding='utf-8') as f:
        agreements.append(f.read())

In [23]:
# Function that flattens NER pipeline output (a list of per-token dicts) into a list of tuples

def collect_entities(doc):
    """Convert each entity dict in `doc` into a fixed-order tuple.

    Args:
        doc: iterable of dicts, each providing the keys
            "word", "score", "entity", "index", "start" and "end".

    Returns:
        A list with one tuple per input dict, laid out as
        (word, score, entity, index, start, end), in input order.

    Raises:
        KeyError: if any dict lacks one of the keys listed above.
    """
    fields = ("word", "score", "entity", "index", "start", "end")
    return [tuple(token[field] for field in fields) for token in doc]

In [20]:
# Function that writes the list of entity info tuples into the csv specified

def write_to_csv(entity_list, header_list, path):
    """Write a header row followed by one row per entity to a CSV file.

    Args:
        entity_list: iterable of row sequences (e.g. tuples); each becomes one CSV row.
        header_list: sequence of column names, written as the first row.
        path: destination file path. An existing file is overwritten; a missing
            parent directory is created.

    Returns:
        None.
    """
    # Create the target folder up front so a fresh checkout without it does not fail
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8 keeps non-ASCII entity text portable across platforms;
    # newline='' lets the csv module control line endings itself
    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
        csvwriter = csv.writer(csv_file, delimiter=',', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(header_list)
        csvwriter.writerows(entity_list)

In [21]:
# Load the pretrained tokenizer and token-classification model for the NER checkpoint
# (spaCy alternative kept for reference: nlp = spacy.load('en_core_web_lg'))
checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

# Wrap both in a "ner" pipeline so that raw text can be passed in directly
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Quick check of the pipeline on a short sentence
example = "My name is Wolfgang and I live in Berlin"
ner_results = nlp(example)
print(ner_results)

[{'word': 'Wolfgang', 'score': 0.9990139603614807, 'entity': 'B-PER', 'index': 4, 'start': 11, 'end': 19}, {'word': 'Berlin', 'score': 0.9996449947357178, 'entity': 'B-LOC', 'index': 9, 'start': 34, 'end': 40}]


In [24]:
# Column names for the csv files, in the same order as the tuples built by collect_entities
header_list = ['word', 'score', 'entity', 'index','start','end']

# Run NER on every agreement text and write its entities to <data_dir>/../ner/<position>.csv
ner_dir = os.path.join(data_dir, '..', 'ner')
for idx, text in enumerate(agreements):
    rows = collect_entities(nlp(text))
    write_to_csv(rows, header_list, os.path.join(ner_dir, '{}.csv'.format(idx)))

In [25]:
# Sample NER output for the first agreement text
# (the displacy call is left disabled; the cell simply displays what nlp returns)

first_agreement = agreements[0]
render_text = nlp(first_agreement)
# displacy.render(render_text, style='ent', jupyter=True)
render_text

[{'end': 73,
  'entity': 'B-LOC',
  'index': 19,
  'score': 0.9988933801651001,
  'start': 64,
  'word': 'Bangalore'},
 {'end': 122,
  'entity': 'B-PER',
  'index': 32,
  'score': 0.9985681772232056,
  'start': 121,
  'word': 'S'},
 {'end': 125,
  'entity': 'I-PER',
  'index': 34,
  'score': 0.9853574633598328,
  'start': 123,
  'word': 'Sa'},
 {'end': 127,
  'entity': 'I-PER',
  'index': 35,
  'score': 0.7655276656150818,
  'start': 125,
  'word': '##ku'},
 {'end': 129,
  'entity': 'I-LOC',
  'index': 36,
  'score': 0.30301573872566223,
  'start': 127,
  'word': '##nt'},
 {'end': 133,
  'entity': 'I-LOC',
  'index': 37,
  'score': 0.9920133352279663,
  'start': 129,
  'word': '##hala'},
 {'end': 137,
  'entity': 'B-LOC',
  'index': 39,
  'score': 0.9849275350570679,
  'start': 136,
  'word': 'H'},
 {'end': 155,
  'entity': 'B-LOC',
  'index': 51,
  'score': 0.9677917957305908,
  'start': 154,
  'word': 'R'},
 {'end': 157,
  'entity': 'B-LOC',
  'index': 52,
  'score': 0.90750920772552

In [27]:
# Mount Google Drive inside the Colab runtime
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [26]:
# Show the raw text of the first agreement (the same text that was passed to nlp for render_text)
agreements[0]

'RENTAL AGREEMENT\n\nThis Rental Agreement is made and executed at Bangalore on this the 1st May 2005 by and between:\n\nMrs. S.Sakunthala,\n\nHno: 4E,10-12-1,\n\nREDNAM ALCAZAR,\n\nOpp. SBI Main, Old Jail Road Jn,\n\nVishakapatnam - 530020\n\nHereinafter called as the “LESSOR” (which expression shall mean and include wherever the context so requires admits his heirs, executors, representatives and assigns) of the ONE PART;\n\nAnd:\n\nV.V.Ravi Kian„\n\nS/o V.R.G.Sastry,\n\nA.S.Rao Nagar,\n\nHyderabad Andhra Pradesh.-36\n\nHereinafter called the “LESSEE” (Which expression shall mean and include wherever the context so requires, admits his executors, representatives and assigns) of the OTHER PART.\n\nWITNESSES AS FOLLOWS:\n\nWhereas the Lessor is the absolute owner of the Fourth Floor residential premises bearing No:407A, T.C.Palya Main Road, Ram murthy nagar, Bangalore-16, which are morefully described in the Schedule written here under and hereinafter referred to as the “Schedule Prope