In [1]:
import os
import csv

import stanza

In [2]:
# Directory containing .txt files

data_dir = os.path.join('..', '..', 'dataset', 'rental-agreement', 'txt')
print(data_dir)

../../dataset/rental-agreement/txt


In [3]:
# Read the content out of each file, store as list of strings

agreements = list()
for fname in os.listdir(data_dir):
    with open(os.path.join(data_dir, fname), 'r', encoding='utf-8') as f:
        agreements.append(f.read())

In [4]:
# Function that takes in a spacy doc object, extracts entity list for the doc
# For each entity, creates the tuple: (text of the entity, start index, end index (exclusive), label of the entity)
# Returns the list of such tuples

def collect_entities(doc):
    ent_texts = list()
    entities = doc.ents
    
    for entity in entities:
        ent_texts.append(
            tuple((entity.text, entity.start_char, entity.end_char, entity.type)))
   
    return ent_texts

In [5]:
# Function that writes the list of entity info tuples into the csv specified

def write_to_csv(entity_list, header_list, path):
    with open(path, 'w', newline='') as csv_file:
        csvwriter = csv.writer(csv_file, delimiter=',', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(header_list)
        [csvwriter.writerow(entity_text) for entity_text in entity_list]
        
    return

In [6]:
# Load the stanza pipeline
nlp = stanza.Pipeline(lang='en')

2021-02-03 11:08:20 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-02-03 11:08:20 INFO: Use device: cpu
2021-02-03 11:08:20 INFO: Loading: tokenize
2021-02-03 11:08:21 INFO: Loading: pos
2021-02-03 11:08:23 INFO: Loading: lemma
2021-02-03 11:08:24 INFO: Loading: depparse
2021-02-03 11:08:25 INFO: Loading: sentiment
2021-02-03 11:08:28 INFO: Loading: ner
2021-02-03 11:08:33 INFO: Done loading processors!


In [7]:
# For each agreement, run NER and store the results to a separate csv file

# Headers for each NER csv file
ner_headers = ['entity', 'start_char', 'end_char', 'label']

for i, agreement in enumerate(agreements):
    doc = nlp(agreement)
    entity_list = collect_entities(doc)
    write_to_csv(entity_list, ner_headers, os.path.join(data_dir, '..', 'stanza-ner', str(i)+'.csv'))