In [1]:
import os
import csv

from tqdm.notebook import tqdm
!pip install allennlp
from allennlp.predictors import Predictor
from nltk.tokenize import word_tokenize
!pip install allennlp_models

# import allennlp_models




In [2]:
# Directory containing the agreement .txt files (read below as 0.txt, 1.txt, ...)
data_dir = os.path.join('/content', 'rental-agreement', 'txt')
# Echo the resolved path as a quick sanity check
print(data_dir)


/content/rental-agreement/txt


In [3]:
# Load the full text of every agreement into memory, one string per file.
# Files are looked up in data_dir as 0.txt, 1.txt, ..., (n_agreements - 1).txt.

n_agreements = 43
agreements = []
for i in range(n_agreements):
    fname = f'{i}.txt'
    # Read mode is the default for open(); only the encoding needs spelling out.
    with open(os.path.join(data_dir, fname), encoding='utf-8') as f:
        agreements.append(f.read())



In [13]:
# Function that takes a NER prediction dict and pairs every token with its predicted tag
# Returns the list of (word, tag) tuples, one per token, in token order

def collect_entities(doc):
    """Pair each word of a prediction with the tag at the same position.

    Args:
        doc: Mapping holding two parallel sequences under the keys
            'words' and 'tags'.

    Returns:
        A list of (word, tag) tuples. If the two sequences differ in
        length, pairing stops at the end of the shorter one.
    """
    return list(zip(doc['words'], doc['tags']))

In [5]:
# Function that writes the list of entity info tuples into the csv specified

def write_to_csv(entity_list, header_list, path):
    """Write a header row followed by one row per entity tuple to a CSV file.

    Fields are comma-separated and wrapped in single quotes (') only when
    the csv module decides quoting is required.

    Args:
        entity_list: Iterable of row sequences, e.g. (word, tag) tuples.
        header_list: Column names, written as the first row.
        path: Destination file. It is overwritten if it exists, and any
            missing parent directory is created first.

    Returns:
        None.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        # On a fresh runtime the output folder may not exist yet; create it
        # rather than letting open() fail with FileNotFoundError.
        os.makedirs(parent_dir, exist_ok=True)

    # Write utf-8 explicitly so non-ASCII tokens round-trip with the utf-8
    # input files regardless of the platform's default encoding.
    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
        csvwriter = csv.writer(csv_file, delimiter=',', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(header_list)
        csvwriter.writerows(entity_list)

In [None]:
# Load the NER model from the archive at the URL below.
# `nlp` is reused later to tag every agreement.

nlp = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz")

# Smoke test on a single sentence: print one "word<TAB>tag" line per token.
results = nlp.predict(sentence="Did Uriah honestly think he could beat The Legend of Zelda in under three hours?")
for word, tag in zip(results["words"], results["tags"]):
    print(f"{word}\t{tag}")
# Bare expression: shows the whole prediction dict as the cell output.
results

In [12]:
# Inspect which fields the prediction dict from the smoke test contains
results.keys()

dict_keys(['logits', 'mask', 'tags', 'words'])

In [16]:
# Tag every agreement with the NER model and save its (word, tag) rows
# to a csv file named after the agreement's index.

# Column names written as the first row of every NER csv file
ner_headers = ['words', 'tags']

for i in tqdm(range(n_agreements)):
    # Output goes next to the input data, under the allenNer folder.
    csv_path = os.path.join('/content/rental-agreement/', 'allenNer', f'{i}.csv')
    doc = nlp.predict(agreements[i])
    write_to_csv(collect_entities(doc), ner_headers, csv_path)

HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))


