In [1]:
import os
from collections import defaultdict

import tqdm.notebook as tqdm
import spacy
import opennre

In [2]:
# Relative location of the folder that holds the agreement .txt files.
path_parts = ('..', '..', 'dataset', 'rental-agreement', 'txt')
data_dir = os.path.join(*path_parts)
print(data_dir)

../../dataset/rental-agreement/txt


In [3]:
# Read the content of each .txt file in data_dir, store as list of strings.
#
# os.listdir() returns names in arbitrary order and also lists any other
# entries in the folder (for example a .ipynb_checkpoints directory).
# Later cells index `agreements` by position, so the names are sorted to keep
# that order stable, and anything that is not a regular .txt file is skipped
# rather than crashing open()/read().

agreements = list()
for fname in sorted(os.listdir(data_dir)):
    fpath = os.path.join(data_dir, fname)
    if not (fname.lower().endswith('.txt') and os.path.isfile(fpath)):
        continue
    with open(fpath, 'r', encoding='utf-8') as f:
        agreements.append(f.read())

In [4]:
# Fetch OpenNRE's pre-trained relation-extraction model by name and echo it
# as the cell output.
pretrained_name = 'wiki80_cnn_softmax'
model = opennre.get_model(pretrained_name)
model

2021-02-05 15:46:31,642 - root - INFO - Initializing word embedding with word2vec.


SoftmaxNN(
  (sentence_encoder): CNNEncoder(
    (word_embedding): Embedding(400002, 50)
    (pos1_embedding): Embedding(80, 5, padding_idx=0)
    (pos2_embedding): Embedding(80, 5, padding_idx=0)
    (drop): Dropout(p=0.5, inplace=False)
    (conv): Conv1d(60, 230, kernel_size=(3,), stride=(1,), padding=(1,))
    (pool): MaxPool1d(kernel_size=40, stride=40, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Linear(in_features=230, out_features=80, bias=True)
  (softmax): Softmax(dim=-1)
  (drop): Dropout(p=0.5, inplace=False)
)

In [5]:
# Illustration 1
#
# model.infer() receives the sentence together with the (start, end) character
# offsets of the head ('h') and tail ('t') entities and returns a
# (relation, score) tuple.

text = 'He was the son of Máel Dúin mac Máele Fithrich, and grandson of the high king Áed Uaridnach (died 612).'

# Each span is defined once so the printed slices and the offsets handed to
# the model cannot drift apart.
head_pos = (18, 46)
tail_pos = (78, 91)
print(text[head_pos[0]:head_pos[1]])
print(text[tail_pos[0]:tail_pos[1]])

# Inference runs exactly once; its value is the cell's displayed result.
model.infer({'text': text, 'h': {'pos': head_pos}, 't': {'pos': tail_pos}})

Máel Dúin mac Máele Fithrich
Áed Uaridnach


('father', 0.7500484585762024)

In [6]:
# Illustration 2: head/tail relation inference on another sentence.

text = 'Donald Trump is the 45th president of the United States'
head_pos, tail_pos = (0, 12), (38, 55)

# Show the two entity mentions selected by the character offsets.
for start, end in (head_pos, tail_pos):
    print(text[start:end])

model.infer({'text': text, 'h': {'pos': head_pos}, 't': {'pos': tail_pos}})

Donald Trump
the United States


('country of citizenship', 0.24717773497104645)

In [7]:
# spaCy English pipeline used for named-entity recognition (NER).
spacy_model_name = 'en_core_web_trf'
nlp = spacy.load(spacy_model_name)
nlp

<spacy.lang.en.English at 0x7fa5020e07c0>

In [8]:
# Relation extraction on agreements using OpenNRE
#
# For each sampled agreement: run spaCy NER, then ask the OpenNRE model for the
# relation between every pair of entities (i < j, in document order).
# Predictions whose label is not 'NA' are stored per agreement index in
# `agr_rels` and in the flat list `rels`, each as a (relation, score) tuple.

agr_rels = defaultdict(list)
rels = list()
n_agrs = len(agreements)
# Only a small sample is processed because the pair loop is quadratic in the
# number of entities. min() avoids an IndexError when fewer than three
# agreements were loaded.
n_sample = min(3, n_agrs)

for i_agr in tqdm.tqdm(range(n_sample)):
    agreement = agreements[i_agr]
    doc = nlp(agreement)
    doc_ents = doc.ents
    n_ents = len(doc_ents)

    for i_ent in tqdm.tqdm(range(n_ents)):
        # The head entity is fixed for the whole inner loop; look it up once.
        entity_i = doc_ents[i_ent]
        head_pos = (entity_i.start_char, entity_i.end_char)
        for j_ent in range(i_ent+1, n_ents):
            entity_j = doc_ents[j_ent]
            # The entity offsets are character positions inside *this*
            # agreement, so the text given to the model must be this agreement
            # too (not always the first one).
            relation = model.infer({'text': agreement,
                                    'h': {'pos': head_pos},
                                    't': {'pos': (entity_j.start_char, entity_j.end_char)}})
            if relation[0] != 'NA':
                agr_rels[i_agr].append(relation)
                rels.append(relation)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

In [9]:
# Print the kept (relation, score) predictions grouped by agreement index.
print(agr_rels)

defaultdict(<class 'list'>, {0: [('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part', 0.368480920791626), ('has part

In [10]:
# Display the flat list of every kept (relation, score) prediction.
rels

[('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 0.368480920791626),
 ('has part', 