In [2]:
import pandas as pd
import opennre
import spacy
import neuralcoref

import urllib
from string import punctuation
import nltk
import json
import itertools

In [192]:
data = pd.read_excel('data/nyt10m_test.xlsx', engine='openpyxl')

data.head(20)

Unnamed: 0,text,h_name,t_name,relation
0,"The Little Comedy , '' a mannered operetta bas...",,,
1,A court in Rome acquitted five people accused ...,,,
2,There were also performers who were born in Lo...,,,
3,"When Julian Resuello , the mayor of San Carlos...",,,
4,"When Julian Resuello , the mayor of San Carlos...",,,
5,"A3 Concern Raised by Libya Deal Alex Salmond ,...",,,
6,"A3 Concern Raised by Libya Deal Alex Salmond ,...",,,
7,"Alex Salmond , the newly elected first ministe...",,,
8,"Alex Salmond , the newly elected first ministe...",,,
9,"Indeed , Mr. ChÃ¡vez said Sunday that he wishe...",,,


In [60]:
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7ff93e196310>

In [167]:
with open('data/nyt10m_rel2id.json') as f:
    _ents = json.load(f)
    
ent_types = []
for k, v in _ents.items():
    _k = k.split('/')
    for k1 in _k:
        ent_types.append(k1)
    
ENTITY_LIST = list(set(ent_types))
ENTITY_LIST = ENTITY_LIST + ["human", "person", "company", "enterprise", "business", "geographic region",
                "human settlement", "geographic entity", "territorial entity type", "organization"]

ENTITY_LIST.remove('')
ENTITY_LIST.remove('NA')

#_ents
ENTITY_LIST.sort()

ENTITY_LIST

['administrative_division',
 'administrative_divisions',
 'advisors',
 'business',
 'business',
 'capital',
 'children',
 'company',
 'company',
 'contains',
 'country',
 'county_seat',
 'deceasedperson',
 'enterprise',
 'ethnicity',
 'event',
 'featured_film_locations',
 'film',
 'founders',
 'geographic entity',
 'geographic region',
 'geographic_distribution',
 'human',
 'human settlement',
 'location',
 'locations',
 'majorshareholders',
 'nationality',
 'neighborhood',
 'neighborhood_of',
 'organization',
 'people',
 'person',
 'person',
 'place_founded',
 'place_lived',
 'place_of_birth',
 'place_of_burial',
 'place_of_death',
 'region',
 'religion',
 'territorial entity type',
 'time',
 'us_county']

In [168]:
def wikifier(text, lang="en", threshold=0.8):
    data = urllib.parse.urlencode([
        ("text", text), ("lang", lang),
        ("userKey", "tgbdmkpmkluegqfbawcwjywieevmza"),
        ("pageRankSqThreshold", "%g" %threshold), ("applyPageRankSqThreshold", "true"),
        ("nTopDfValuesToIgnore", "100"), ("nWordsToIgnoreFromList", "100"),
        ("wikiDataClasses", "true"), ("wikiDataClassIds", "false"),
        ("support", "true"), ("ranges", "false"), ("minLinkFrequency", "2"),
        ("includeCosines", "false"), ("maxMentionEntropy", "3")
    ])
    
    url = "http://www.wikifier.org/annotate-article"
    req = urllib.request.Request(url, data=data.encode("utf8"), method="POST")
    with urllib.request.urlopen(req, timeout=60) as f:
        response = f.read()
        response = json.loads(response.decode("utf8"))

    results = []
    
    for annotation in response["annotations"]:
        if ('wikiDataClasses' in annotation) and (any([el['enLabel'] in ENTITY_TYPES for el in annotation['wikiDataClasses']])):
                
            results.append({'title':annotation['title'], 
                            #'wikiId': annotation['wikiDataItemId']},
                            'characters': [(el['chFrom'], el['chTo']) for el in annotation['support']],
                            #'label': [el['enLabel'] for el in annotation['wikiDataClasses']][0:5]
                           })
    
    return results

In [169]:
test1="Elon Musk is a business magnate, industrial designer, and engineer. He is the founder, CEO, CTO, and chief designer of SpaceX"
wikifier(test1)

[{'title': 'Elon Musk', 'characters': [(0, 8), (5, 8)]},
 {'title': 'SpaceX', 'characters': [(119, 124)]}]

In [170]:
def resolved_text_fn(text):
    return text._.coref_resolved

In [171]:
resolved_text_fn(nlp(test1))

'Elon Musk is a business magnate, industrial designer, and engineer. Elon Musk is the founder, CEO, CTO, and chief designer of SpaceX'

In [91]:
df = data[['text']].copy()

text_data = df['text'].tolist()
text_data_str = [ ''+str(td) for td in text_data]

#text_data_str

resolved_text = []

for td in nlp.pipe(text_data_str):
    resolved_text.append({'text':td, 'resolved_text':resolved_text_fn(td)})
    
resolved_text[0:10]

[{'text': The Little Comedy , '' a mannered operetta based on a short story by Arthur Schnitzler set in fin-de-si Ã¨cle Vienna , opens the evening .,
  'resolved_text': "The Little Comedy , '' a mannered operetta based on a short story by Arthur Schnitzler set in fin-de-si Ã¨cle Vienna , opens the evening ."},
 {'text': A court in Rome acquitted five people accused of conspiring to murder Roberto Calvi , who was president of Banco Ambrosiano , one of Italy 's largest private banks , and a financial adviser to the Vatican when he was found hanged under Blackfriars Bridge in London in 1982 , his pockets stuffed with rocks , bricks and cash .,
  'resolved_text': "A court in Rome acquitted five people accused of conspiring to murder Roberto Calvi , who was president of Banco Ambrosiano , one of Italy 's largest private banks , and a financial adviser to the Vatican when he was found hanged under Blackfriars Bridge in London in 1982 , he pockets stuffed with rocks , bricks and cash ."},
 {'

In [53]:
model = opennre.get_model('wiki80_cnn_softmax')

2021-08-07 18:22:30,809 - root - INFO - Initializing word embedding with word2vec.


In [46]:
with open('data/nyt10m_rel2id.json') as f:
    _ents2 = json.load(f)
    
ent_types2 = []
for k, v in _ents2.items():
    _k = k.split('/')
    for k1 in _k:
        if k1 != '' and k1 != 'NA':
            ent_types2.append((k,k1))
    
_ents2

{'/people/person/nationality': 2,
 '/time/event/locations': 22,
 '/people/person/children': 14,
 '/business/company/advisors': 19,
 '/business/location': 18,
 '/business/company/majorshareholders': 16,
 '/people/person/place_lived': 5,
 'NA': 0,
 '/business/company/place_founded': 11,
 '/location/neighborhood/neighborhood_of': 8,
 '/people/deceasedperson/place_of_death': 4,
 '/film/film/featured_film_locations': 21,
 '/location/region/capital': 23,
 '/business/company/founders': 6,
 '/people/ethnicity/geographic_distribution': 17,
 '/location/country/administrative_divisions': 12,
 '/people/deceasedperson/place_of_burial': 24,
 '/location/country/capital': 13,
 '/business/person/company': 9,
 '/location/location/contains': 1,
 '/location/administrative_division/country': 10,
 '/location/us_county/county_seat': 20,
 '/people/person/religion': 15,
 '/people/person/place_of_birth': 3,
 '/people/person/ethnicity': 7}

In [210]:
from operator import itemgetter

def find_relation(entities, r_text):
    permutation_value = []
    for permutation in itertools.permutations(entities, 2):
        for source in permutation[0]['characters']:
            for target in permutation[1]['characters']:
                if source != None or target != None:
                    data = model.infer({'text': r_text, 
                                        'h': {'pos': [source[0], source[1] + 1]}, 
                                        't': {'pos': [target[0], target[1] + 1]}})
                
                permutation_value.append((data[1],permutation[0]['title'], permutation[1]['title'], data[0]))
    
    max_item = max(permutation_value, key=itemgetter(0), default=0)
    #print("xxx",max_item)
    return max_item

In [211]:
entities_threshold = 0.8
relations_list = []

for i, rt in enumerate(resolved_text):
    t1 = rt['text']
    t2 = rt['resolved_text']
    
    entities = wikifier(t2, threshold=entities_threshold)
    #find_relation(entities, t2)
    
    #print(i, ">> ", entities)
    #print(t1)
    
    rel_max_data = find_relation(entities, t2)
    
    if rel_max_data == 0:
        relations_list.append({'text':t1,
                           'h_name':'NA',
                           't_name':'NA',
                           'relation':'NA'})
    else:
        relations_list.append({'text':t1,
                           'h_name':rel_max_data[1],
                           't_name':rel_max_data[2],
                           'relation':rel_max_data[3]})
    
result_df = pd.DataFrame(relations_list)
result_df.to_csv('data/result.csv', index=False)

## Preparing the final result

In [3]:
rdf = pd.read_csv('data/result.csv')

rdf.head()

Unnamed: 0,text,h_name,t_name,relation
0,"The Little Comedy , '' a mannered operetta bas...",Arthur Schnitzler,Vienna,work location
1,A court in Rome acquitted five people accused ...,Banco Ambrosiano,Roberto Calvi,head of government
2,There were also performers who were born in Lo...,Johnny Rivers,Jerry Lee Lewis,tributary
3,"When Julian Resuello , the mayor of San Carlos...","San Carlos, Pangasinan",Philippines,country
4,"When Julian Resuello , the mayor of San Carlos...","San Carlos, Pangasinan",Philippines,country


In [40]:
rdf.to_excel('data/result.xlsx', index=None, header=True)

In [4]:
rdf_cpy = rdf.copy()

In [5]:
rdf_cpy.columns = ['text','h_name','t_name','final_relation']

rdf_cpy.head()

Unnamed: 0,text,h_name,t_name,final_relation
0,"The Little Comedy , '' a mannered operetta bas...",Arthur Schnitzler,Vienna,work location
1,A court in Rome acquitted five people accused ...,Banco Ambrosiano,Roberto Calvi,head of government
2,There were also performers who were born in Lo...,Johnny Rivers,Jerry Lee Lewis,tributary
3,"When Julian Resuello , the mayor of San Carlos...","San Carlos, Pangasinan",Philippines,country
4,"When Julian Resuello , the mayor of San Carlos...","San Carlos, Pangasinan",Philippines,country


In [94]:
rels = rdf_cpy['final_relation'].unique()

mapping = []
for r in rels:
    
    if r in ['winner',
                 'sport',
                 'successful candidate','occupation']:
        label = '/people/person/nationality'
        
    elif r in ['work location','part of','participant','participant of',
                      'participating team',
                      'sports season of league or competition',
                      'followed by',
                      'follows','location',
                      'location of formation','notable work']:
        label = '/time/event/locations'
        
    elif r in ['child',
                      'competition class']:
        label = '/people/person/children'
        
    elif r in ['director',
                      'head of government',
                      'member of',
                      'member of political party',
                      'licensed to broadcast to']:
        label = '/business/company/advisors'
        
    elif r in ['distributor',
                      'field of work',
                      'performer',
                      'place served by transport hub','operator']:
        label = '/business/location'
        
    elif r in ['position played on team / speciality']:
        label = '/business/company/majorshareholders'
        
    elif r in ['country of citizenship']:
        label = '/people/person/place_lived'
        
    elif r in ['developer']:
        label = '/business/company/place_founded'
        
    elif r in ['residence',
                      'occupant']:
        label = '/location/neighborhood/neighborhood_of'
        
    #elif any(r in ['']):
    #    label = '/people/deceasedperson/place_of_death'
        
    elif r in ['after a work by',
                      'composer',
                      'instrument']:
        label = '/film/film/featured_film_locations'
        
   # elif any(r in ['']):
   #     label = '/location/region/capital'
        
    elif r in ['publisher',
                      'record label',
                      'screenwriter',
                      'owned by']:
        label = '/business/company/founders'
        
    #elif any(r in ['']):
    #    label = '/people/ethnicity/geographic_distribution'
        
    elif r in ['applies to jurisdiction',
                      'headquarters location',
                      'position held',
                      'located in the administrative territorial entity']:
        label = '/location/country/administrative_divisions'
        
    #elif any(r in ['']):
    #    label = '/people/deceasedperson/place_of_burial'
        
    elif r in ['country']:
        label = '/location/country/capital'
        
    elif r in ['characters',
                      'subsidiary',
                      'instance of',
                      'manufacturer',
                      'main subject']:
        label = '/business/person/company'
        
    elif r in ['architect','constellation','heritage designation','league','genre',
                      'has part',
                      'tributary',
                      'located in or next to body of water',
                      'located on terrain feature',
                      'mountain range','mouth of the watercourse']:
        label = '/location/location/contains'
        
    elif r in ['contains administrative territorial entity','military branch','military rank']:
        label = '/location/administrative_division/country'
        
    #elif any(r in ['']):
    #    label = '/location/us_county/county_seat'
        
    elif r in ['religion']:
        label = '/people/person/religion'
        
    elif r in ['sibling','father','spouse','mother']:
        label = '/people/person/place_of_birth'
        
    elif r in ['country of origin','language of work or name']:
        label = '/people/person/ethnicity'
    else:
        label = 'NA'
        
    mapping.append({'rel':r,'label':label})
    
mapping

[{'rel': 'work location', 'label': '/time/event/locations'},
 {'rel': 'head of government', 'label': '/business/company/advisors'},
 {'rel': 'tributary', 'label': '/location/location/contains'},
 {'rel': 'country', 'label': '/location/country/capital'},
 {'rel': 'country of citizenship', 'label': '/people/person/place_lived'},
 {'rel': 'part of', 'label': '/time/event/locations'},
 {'rel': 'position held',
  'label': '/location/country/administrative_divisions'},
 {'rel': nan, 'label': 'NA'},
 {'rel': 'member of', 'label': '/business/company/advisors'},
 {'rel': 'subsidiary', 'label': '/business/person/company'},
 {'rel': 'instance of', 'label': '/business/person/company'},
 {'rel': 'member of political party', 'label': '/business/company/advisors'},
 {'rel': 'followed by', 'label': '/time/event/locations'},
 {'rel': 'location', 'label': '/time/event/locations'},
 {'rel': 'has part', 'label': '/location/location/contains'},
 {'rel': 'residence', 'label': '/location/neighborhood/neighbo

In [126]:
def get_mapping(val):
    match = 'NA'
    for m in mapping:
        if m['rel'] == val:
            match = m['label']
            
    return match

In [127]:
get_mapping('developer')

'/business/company/place_founded'

In [128]:
rdf_cpy['relation'] = rdf_cpy['final_relation'].apply(lambda x: get_mapping(x))

rdf_cpy.head()

Unnamed: 0,text,h_name,t_name,final_relation,relation
0,"The Little Comedy , '' a mannered operetta bas...",Arthur Schnitzler,Vienna,work location,/time/event/locations
1,A court in Rome acquitted five people accused ...,Banco Ambrosiano,Roberto Calvi,head of government,/business/company/advisors
2,There were also performers who were born in Lo...,Johnny Rivers,Jerry Lee Lewis,tributary,/location/location/contains
3,"When Julian Resuello , the mayor of San Carlos...","San Carlos, Pangasinan",Philippines,country,/location/country/capital
4,"When Julian Resuello , the mayor of San Carlos...","San Carlos, Pangasinan",Philippines,country,/location/country/capital


In [129]:
rdf_cpy.to_excel('data/result_final.xlsx', index=None, header=True)