In [1]:
import os
import csv

import spacy
from spacy import displacy

import docx

In [2]:
# Relative location of the folder that holds the agreement .txt files

path_parts = ('..', '..', 'dataset', 'rental-agreement', 'txt')
data_dir = os.path.join(*path_parts)
print(data_dir)

..\..\dataset\rental-agreement\txt


In [3]:
# Read the content out of each file in data_dir, store as list of strings.
# Names are sorted so the list order (and any index derived from it) is the
# same on every run; os.listdir itself makes no ordering guarantee.
# Entries that are not regular files (e.g. sub-directories) are skipped
# instead of making open() fail.

agreements = list()
for fname in sorted(os.listdir(data_dir)):
    fpath = os.path.join(data_dir, fname)
    if not os.path.isfile(fpath):
        continue
    with open(fpath, 'r', encoding='utf-8') as f:
        agreements.append(f.read())

In [4]:
def collect_entities(doc):
    """Extract the entity list of a spaCy doc object as plain tuples.

    Args:
        doc: Object exposing an ``ents`` iterable whose items carry ``text``,
            ``start_char``, ``end_char`` and ``label_`` attributes (a spaCy
            ``Doc`` in this notebook).

    Returns:
        list[tuple]: One ``(text, start index, end index (exclusive), label)``
        tuple per entity, in the order of ``doc.ents``. An empty list is
        returned when the doc has no entities.
    """
    # Both offsets are passed through unchanged. Previously only end_char was
    # converted to str, which left the two offsets with different types; the
    # csv output is the same either way because the writer stringifies values.
    return [
        (entity.text, entity.start_char, entity.end_char, entity.label_)
        for entity in doc.ents
    ]

In [5]:
def write_to_csv(entity_list, header_list, path):
    """Write a header row followed by the entity info tuples to a csv file.

    Args:
        entity_list: Iterable of row sequences (the entity info tuples).
        header_list: Sequence of column names written as the first row.
        path: Destination csv path. An existing file is overwritten; a
            missing parent directory is created.

    Returns:
        None
    """
    # Create the target folder on demand so a fresh checkout does not fail
    # with FileNotFoundError. dirname is '' for a bare file name; skip then.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # can raise UnicodeEncodeError on non-ASCII entity text.
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
        csvwriter = csv.writer(csv_file, delimiter=',', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(header_list)
        csvwriter.writerows(entity_list)

In [6]:
# spaCy pipeline used for NER (large-size model)
model_name = 'en_core_web_lg'
nlp = spacy.load(model_name)

# Column names written as the first row of every csv file
header_list = [
    'entity',
    'start_char',
    'end_char',
    'label',
]

In [7]:
# For each file text, create doc object and call the functions to perform NER
# and write the result in a separate csv named after the text's position in
# `agreements` (0.csv, 1.csv, ...) inside the sibling 'ner' folder of data_dir.

# The output folder is the same for every document, so build it once and make
# sure it exists before the first file is written.
ner_dir = os.path.join(data_dir, '..', 'ner')
os.makedirs(ner_dir, exist_ok=True)

# nlp.pipe processes the texts as a batched stream and yields the docs in
# input order, which avoids one full pipeline call per agreement.
for i, doc in enumerate(nlp.pipe(agreements)):
    ent_texts = collect_entities(doc)

    filepath = os.path.join(ner_dir, str(i)+'.csv')
    write_to_csv(ent_texts, header_list, filepath)

In [8]:
# Sample rendering of NER on the first file text using displacy

if agreements:
    render_text = nlp(agreements[0])
    displacy.render(render_text, style='ent', jupyter=True)
else:
    # No text was loaded; report it instead of raising IndexError on agreements[0]
    print('No agreements loaded; nothing to render.')