In [134]:
#!pip install git+https://github.com/thunlp/OpenNRE.git
#!pip install spacy
#!pip install git+https://github.com/huggingface/neuralcoref.git
#!pip install neuralcoref

#!pip install spacy==2.1.0

#!pip uninstall neuralcoref -y
#!pip install neuralcoref

#!python -m spacy download en_core_web_sm

In [1]:
import pandas as pd
import opennre
import spacy
import neuralcoref

import urllib
from string import punctuation
import nltk
import json
import itertools

2021-08-07 12:19:45,640 - neuralcoref - INFO - Loading model from /Users/rishushrivastava/.neuralcoref_cache/neuralcoref


In [2]:
data = pd.read_excel('data/nyt10m_test.xlsx', engine='openpyxl')

len(data)

11086

In [3]:
spacy.__version__

'2.1.0'

In [4]:
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7fc325042350>

In [5]:
doc = nlp('My sister has a dog. She loves him.')

In [6]:
doc._.coref_clusters

[My sister: [My sister, She], a dog: [a dog, him]]

In [7]:
doc._.coref_resolved

'My sister has a dog. My sister loves a dog.'

In [8]:
doc = nlp('David went to the concert. He said it was an amazing experience.')
doc._.coref_clusters,doc._.coref_resolved

([David: [David, He]],
 'David went to the concert. David said it was an amazing experience.')

In [9]:
with open('data/nyt10m_rel2id.json') as f:
    _ents = json.load(f)
    
ent_types = []
for k, v in _ents.items():
    _k = k.split('/')
    for k1 in _k:
        ent_types.append(k1)
    
list(set(ent_types))


['',
 'advisors',
 'neighborhood',
 'religion',
 'locations',
 'contains',
 'place_of_burial',
 'NA',
 'capital',
 'location',
 'founders',
 'ethnicity',
 'nationality',
 'place_of_birth',
 'administrative_division',
 'featured_film_locations',
 'region',
 'business',
 'majorshareholders',
 'country',
 'place_of_death',
 'deceasedperson',
 'company',
 'place_lived',
 'county_seat',
 'place_founded',
 'us_county',
 'geographic_distribution',
 'people',
 'children',
 'neighborhood_of',
 'film',
 'person',
 'time',
 'event',
 'administrative_divisions']

In [10]:
ENTITY_TYPES = ["human", "person", "company", "enterprise", "business", "geographic region",
                "human settlement", "geographic entity", "territorial entity type", "organization"]

In [11]:
def wikifier(text, lang="en", threshold=0.8):
    """Function that fetches entity linking results from wikifier.com API"""
    # Prepare the URL.
    data = urllib.parse.urlencode([
        ("text", text), ("lang", lang),
        ("userKey", "tgbdmkpmkluegqfbawcwjywieevmza"),
        ("pageRankSqThreshold", "%g" %
         threshold), ("applyPageRankSqThreshold", "true"),
        ("nTopDfValuesToIgnore", "100"), ("nWordsToIgnoreFromList", "100"),
        ("wikiDataClasses", "true"), ("wikiDataClassIds", "false"),
        ("support", "true"), ("ranges", "false"), ("minLinkFrequency", "2"),
        ("includeCosines", "false"), ("maxMentionEntropy", "3")
    ])
    
    url = "http://www.wikifier.org/annotate-article"
    # Call the Wikifier and read the response.
    req = urllib.request.Request(url, data=data.encode("utf8"), method="POST")
    with urllib.request.urlopen(req, timeout=60) as f:
        response = f.read()
        response = json.loads(response.decode("utf8"))
    # Output the annotations.
    results = list()
    
    for annotation in response["annotations"]:
        if ('wikiDataClasses' in annotation) and (any([el['enLabel'] in ENTITY_TYPES for el in annotation['wikiDataClasses']])):

            # Specify entity label
            if any([el['enLabel'] in ["human", "person"] 
                    for el in annotation['wikiDataClasses']]):
                label = 'Person'
            elif any([el['enLabel'] in ["company", "enterprise", "business", "organization"] 
                      for el in annotation['wikiDataClasses']]):
                label = 'Organization'
            elif any([el['enLabel'] in ["geographic region", "human settlement", "geographic entity", "territorial entity type"] 
                      for el in annotation['wikiDataClasses']]):
                label = 'Location'
            else:
                label = None

            results.append({'title': annotation['title'], 'wikiId': annotation['wikiDataItemId'], 'label': label,
                            'characters': [(el['chFrom'], el['chTo']) for el in annotation['support']]})
    return results

In [12]:
wikifier("Elon Musk is a business magnate, industrial designer, and engineer. Elon Musk is the founder, CEO, CTO, and chief designer of SpaceX")

[{'title': 'Elon Musk',
  'wikiId': 'Q317521',
  'label': 'Person',
  'characters': [(0, 8), (5, 8), (68, 76), (73, 76)]},
 {'title': 'SpaceX',
  'wikiId': 'Q193701',
  'label': 'Organization',
  'characters': [(126, 131)]}]

In [13]:
txt = data['text'][0]

txt

"The Little Comedy , '' a mannered operetta based on a short story by Arthur Schnitzler set in fin-de-si Ã¨cle Vienna , opens the evening ."

In [14]:
#data_resolved_text = []

doc = nlp(txt)
resolved_text = doc._.coref_resolved
#data_resolved_text.append({'text':t, 'resolved_text':resolved_text})
resolved_text

"The Little Comedy , '' a mannered operetta based on a short story by Arthur Schnitzler set in fin-de-si Ã¨cle Vienna , opens the evening ."

In [15]:
model = opennre.get_model('wiki80_cnn_softmax')
#model = model.cuda()

entities_threshold = 0.8

relations_list = []
# First get all the entities in the sentence
entities = wikifier(resolved_text, threshold=entities_threshold)

2021-08-07 12:20:09,078 - root - INFO - Initializing word embedding with word2vec.


In [16]:
entities

[{'title': 'Arthur Schnitzler',
  'wikiId': 'Q44331',
  'label': 'Person',
  'characters': [(69, 85), (76, 85)]},
 {'title': 'Vienna',
  'wikiId': 'Q1741',
  'label': 'Organization',
  'characters': [(110, 115)]}]

In [17]:
list(itertools.permutations(entities, 2))[0]

({'title': 'Arthur Schnitzler',
  'wikiId': 'Q44331',
  'label': 'Person',
  'characters': [(69, 85), (76, 85)]},
 {'title': 'Vienna',
  'wikiId': 'Q1741',
  'label': 'Organization',
  'characters': [(110, 115)]})

In [18]:
relation_threshold = 0.90
for permutation in itertools.permutations(entities, 2):
    for source in permutation[0]['characters']:
        for target in permutation[1]['characters']:
            # Relationship extraction with OpenNRE
            data = model.infer({'text': resolved_text, 
                                'h': {'pos': [source[0], source[1] + 1]}, 
                                't': {'pos': [target[0], target[1] + 1]}})
            
            if data[1] > relation_threshold:
                print (data[1],permutation[0]['title'],permutation[1]['title'], data[0])
            
                relations_list.append(
                    {'source': permutation[0]['title'], 
                     'target': permutation[1]['title'], 
                     'type': data[0]})
                
relations_list

0.9284580945968628 Arthur Schnitzler Vienna work location
0.9112037420272827 Arthur Schnitzler Vienna work location


[{'source': 'Arthur Schnitzler', 'target': 'Vienna', 'type': 'work location'},
 {'source': 'Arthur Schnitzler', 'target': 'Vienna', 'type': 'work location'}]

In [19]:
model.infer({'text': resolved_text, 
                                'h': {'pos': [69, 86]}, 
                                't': {'pos': [110, 117]}})

('work location', 0.9284580945968628)

In [20]:
#model = opennre.get_model('wiki80_cnn_softmax')

In [21]:
#model.infer({'text': 'He was the son of Máel Dúin mac Máele Fithrich, and grandson of the high king Áed Uaridnach (died 612).', 
#             'h': {'pos': (18, 46)}, 
#             't': {'pos': (78, 91)}})

In [97]:
def prepare_nlp(text):
    doc = nlp(txt)
    resolved_text = doc._.coref_resolved
    #data_resolved_text.append({'text':t, 'resolved_text':resolved_text})
    entities = wikifier(resolved_text, threshold=entities_threshold)
    return (resolved_text, entities)

In [99]:
#data['resolved'] = data['text'].apply(lambda x: prepare_nlp(x))

#data.head()

In [143]:
result.head(30)

Unnamed: 0,text,source,target,type,threshold
0,"The Little Comedy , '' a mannered operetta bas...",Arthur Schnitzler,Vienna,work location,0.928458
1,"The Little Comedy , '' a mannered operetta bas...",Arthur Schnitzler,Vienna,work location,0.911204
2,"The Little Comedy , '' a mannered operetta bas...",Vienna,Arthur Schnitzler,architect,0.608013
3,"The Little Comedy , '' a mannered operetta bas...",Vienna,Arthur Schnitzler,owned by,0.391916
4,A court in Rome acquitted five people accused ...,Arthur Schnitzler,Vienna,work location,0.928458
5,A court in Rome acquitted five people accused ...,Arthur Schnitzler,Vienna,work location,0.911204
6,A court in Rome acquitted five people accused ...,Vienna,Arthur Schnitzler,architect,0.608013
7,A court in Rome acquitted five people accused ...,Vienna,Arthur Schnitzler,owned by,0.391916
8,There were also performers who were born in Lo...,Arthur Schnitzler,Vienna,work location,0.928458
9,There were also performers who were born in Lo...,Arthur Schnitzler,Vienna,work location,0.911204


In [145]:
result2 = result.copy()

In [149]:
result2[['text','source','target','type']][result2.threshold == result2['threshold'].max()]



Unnamed: 0,text,source,target,type
0,"The Little Comedy , '' a mannered operetta bas...",Arthur Schnitzler,Vienna,work location
4,A court in Rome acquitted five people accused ...,Arthur Schnitzler,Vienna,work location
8,There were also performers who were born in Lo...,Arthur Schnitzler,Vienna,work location
12,"When Julian Resuello , the mayor of San Carlos...",Arthur Schnitzler,Vienna,work location
16,"When Julian Resuello , the mayor of San Carlos...",Arthur Schnitzler,Vienna,work location
...,...,...,...,...
44324,"Staten Island Steve Whalen , an aspiring filmm...",Arthur Schnitzler,Vienna,work location
44328,"Lukacs , a distinguished historian of 20th-cen...",Arthur Schnitzler,Vienna,work location
44332,Since accepting an invitation in 1996 from the...,Arthur Schnitzler,Vienna,work location
44336,"LOYOLA , MD. 62 , ST. PETER 'S 55 -- Michael T...",Arthur Schnitzler,Vienna,work location


In [152]:
data['text'].apply(lambda x: prepare_nlp(x))

KeyboardInterrupt: 