In [1]:
import re
from pathlib import Path

import nltk
import pandas as pd
from nltk.parse import CoreNLPParser
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize
#!pip install -U nltk[corenlp]

In [2]:
# Parse the KBP37 training file into a two-column DataFrame (sentence, relation).
path = "./data/SOTA_datasets/KBP37/train.txt"

# The file is consumed in groups of four lines: line 1 carries the sentence,
# line 2 the relation label, and reaching line 4 commits the record.
data_dict = {}
with open(path) as file:
    count, sentence, label = 0, "", ""
    for line in file:
        count += 1
        if count == 4:
            # Keyed by sentence: a repeated sentence keeps only its latest label.
            data_dict[sentence] = label
            count = 0
        elif count == 2:
            # Drop the last character of the label line.
            label = line[:-1]
        elif count == 1:
            # Start two characters after the first tab and drop the last
            # two characters of the line.
            sentence = line[line.find("\t") + 2:-2]

dataset = pd.DataFrame(list(data_dict.items()), columns=["sentence", "relation"])

## Data Analysis
#### Set of relations (18 bidirectional relations --> 36 + 1 artificial class = 37)
The `org` prefix stands for "organization", while `per` stands for "person".

In [3]:
# Count the sentences per relation, ignoring the "(e1,e2)" / "(e2,e1)" direction.
# NOTE: `temp` is the same object as `dataset`, so the "rel" column added here
# is also added to `dataset`. The "relation" column is kept.
temp = dataset
relation_parts = temp["relation"].str.partition("(")
temp["rel"] = relation_parts[0]
rel_sizes = temp.groupby("rel").size()
rel_sizes.to_frame("count").reset_index()

Unnamed: 0,rel,count
0,no_relation,1544
1,org:alternate_names,511
2,org:city_of_headquarters,1267
3,org:country_of_headquarters,1005
4,org:founded,393
5,org:founded_by,355
6,org:members,703
7,org:stateorprovince_of_headquarters,517
8,org:subsidiaries,830
9,org:top_members/employees,575


## Data Preparation

In [4]:
def getEntity(sentence, first):
    """Return the text enclosed by one of the two entity tag pairs.

    Parameters
    ----------
    sentence : str
        Sentence annotated with ``<e1>...</e1>`` and ``<e2>...</e2>``.
    first : bool
        Truthy selects the ``e1`` pair, falsy selects the ``e2`` pair.

    Returns
    -------
    str
        The captured text exactly as it appears between the tags, including
        any whitespace adjacent to them.

    Raises
    ------
    AttributeError
        If the requested tag pair is not found (``re.search`` returns ``None``).
    """
    pattern = '<e1>(.*)</e1>' if first else '<e2>(.*)</e2>'
    match = re.search(pattern, sentence)
    return match.group(1)

In [5]:
# Remove directionality from the relations: keep only the text before the
# first "(" of each label. The original "relation" column is left in place.
relation_name = dataset["relation"].str.partition("(")[0]
dataset["rel"] = relation_name

In [6]:
# Add the e1 and e2 columns by extracting each tagged entity from the sentence.
def _entity_of_row(row, first):
    """Extract the e1 (first=True) or e2 (first=False) text from row["sentence"]."""
    return getEntity(row["sentence"], first)


dataset["e1"] = dataset.apply(_entity_of_row, axis=1, args=(True,))
dataset["e2"] = dataset.apply(_entity_of_row, axis=1, args=(False,))
dataset.head(n=5)

Unnamed: 0,sentence,relation,rel,e1,e2
0,<e1> Thom Yorke </e1> of <e2> Radiohead </e2>...,"per:employee_of(e1,e2)",per:employee_of,Thom Yorke,Radiohead
1,<e1> Leland High School </e1> is a public hig...,"org:city_of_headquarters(e1,e2)",org:city_of_headquarters,Leland High School,San Jose
2,The 2008 Ohio Bobcats football team represent...,"org:members(e2,e1)",org:members,Ohio University,NCAA
3,<e1> Holy Cross High School </e1> is a Cathol...,"org:founded_by(e1,e2)",org:founded_by,Holy Cross High School,Congregation of Holy Cross
4,Hastings was unable to confirm news reports t...,"per:employee_of(e2,e1)",per:employee_of,Democratic,Bill Gwatney


## NER annotations
Launch this command from the dir stanford-corenlp-full-2018-02-27 to start the server: 

java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
-preload tokenize,ssplit,pos,lemma,ner,parse,depparse \
-status_port 9000 -port 9000 -timeout 15000 &  

In [7]:
# Client for the CoreNLP server on localhost:9000, configured with tagtype='ner'.
ner_tagger = CoreNLPParser(
    url='http://localhost:9000',
    tagtype='ner',
)

In [39]:
def _entity_token_spans(sentence):
    """Locate the tagged entities as inclusive token spans of the tag-free sentence.

    The sentence is split on whitespace; every token that is exactly one of
    ``<e1>``, ``</e1>``, ``<e2>``, ``</e2>`` counts as a tag. Each index is
    shifted by the number of tags seen so far, so the spans are valid for the
    token list obtained after the tags are removed, whichever entity comes first.

    Returns a dict mapping "e1" / "e2" to ``(start, end)``; an entity whose
    opening or closing tag is missing is absent from the dict.
    """
    spans = {}
    open_starts = {}
    tags_seen = 0
    for position, tok in enumerate(sentence.split()):
        if tok in ('<e1>', '<e2>'):
            # The first entity token follows this tag; once this tag and the
            # earlier ones are removed it sits at position - tags_seen.
            open_starts[tok[1:-1]] = position - tags_seen
            tags_seen += 1
        elif tok in ('</e1>', '</e2>'):
            entity = tok[2:-1]
            if entity in open_starts:
                # The last entity token precedes this tag.
                spans[entity] = (open_starts[entity], position - tags_seen - 1)
            tags_seen += 1
    return spans


# given a sentence it returns one set of types for e1 and one for e2
def getEntityTypes(sentence, ner_type):
    """Return the NER types found on the tokens of e1 and of e2.

    Parameters
    ----------
    sentence : str
        Sentence with ``<e1>...</e1>`` and ``<e2>...</e2>`` given as
        whitespace-separated tag tokens.
    ner_type : str
        Tagger selector; only ``"CNLP"`` (the module-level ``ner_tagger``)
        is supported.

    Returns
    -------
    (set, set)
        Types of every token inside e1 and inside e2, with ``'O'`` removed.
        An entity whose tags are missing yields an empty set. If tagging or
        indexing fails, the sentence is printed and whatever was collected
        up to that point is returned.

    Raises
    ------
    ValueError
        If ``ner_type`` is not supported.

    Notes
    -----
    Assumes ``ner_tagger.tag`` returns one ``(token, tag)`` pair per
    whitespace token it was given -- TODO confirm against the server's
    tokenization. A shorter result surfaces as an ``IndexError``, which is
    reported by printing the sentence.
    """
    if ner_type != "CNLP":
        raise ValueError("unsupported ner_type: {!r}".format(ner_type))

    # Bound before the try block so the return below can never hit unbound names.
    e1_types, e2_types = set(), set()
    sentence_clean = sentence.replace("<e1>", "").replace("</e1>", "").replace("<e2>", "").replace("</e2>", "")

    try:
        # Stanford CoreNLP simple NER tagger
        sentence_ner = ner_tagger.tag(sentence_clean.split())
        spans = _entity_token_spans(sentence)
        for entity, types in (("e1", e1_types), ("e2", e2_types)):
            if entity not in spans:
                continue
            start, end = spans[entity]
            # Collect the tag of every token in the span, not just its endpoints.
            for i in range(start, end + 1):
                types.add(sentence_ner[i][1])
    except Exception:
        # Best effort: report the offending sentence and keep going.
        # KeyboardInterrupt is deliberately not caught, so a long run can be stopped.
        print(sentence)

    # remove 'O' from types (also after a partial failure)
    e1_types.discard('O')
    e2_types.discard('O')

    # TODO: use a list instead of a set and return the most frequent tag
    return e1_types, e2_types

In [9]:
# Add the e1_type_<ner_type> and e2_type_<ner_type> columns, one row at a time.
ner_type = "CNLP"
e1_column = "e1_type_" + ner_type
e2_column = "e2_type_" + ner_type

count = 0
for index, sentence in dataset["sentence"].items():
    e1_Type, e2_Type = getEntityTypes(sentence, ner_type)
    # Each cell holds the comma-joined types of that entity.
    dataset.at[index, e1_column] = ",".join(e1_Type)
    dataset.at[index, e2_column] = ",".join(e2_Type)

    # Progress report every 500 rows.
    count += 1
    if not count % 500:
        print(count)

500
 <e1> Russian Premier League </e1> :1997 1998 1999 2000 2001 Russian Cup : 1998 With <e2> FC Spartak Moscow </e2> . 
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
 <e1> His </e1> third album Benny . . . at Home has met critical acclaim outside of the <e2> Netherlands </e2> . 
14500
15000
15500


In [42]:
# Preview the first five rows of the dataset.
dataset.head(n=5)

Unnamed: 0,sentence,relation,rel,e1,e2,e1-type_CNLP,e2_type_CNLP
0,<e1> Thom Yorke </e1> of <e2> Radiohead </e2>...,"per:employee_of(e1,e2)",per:employee_of,Thom Yorke,Radiohead,PERSON,ORGANIZATION
1,<e1> Leland High School </e1> is a public hig...,"org:city_of_headquarters(e1,e2)",org:city_of_headquarters,Leland High School,San Jose,ORGANIZATION,CITY
2,The 2008 Ohio Bobcats football team represent...,"org:members(e2,e1)",org:members,Ohio University,NCAA,ORGANIZATION,ORGANIZATION
3,<e1> Holy Cross High School </e1> is a Cathol...,"org:founded_by(e1,e2)",org:founded_by,Holy Cross High School,Congregation of Holy Cross,ORGANIZATION,ORGANIZATION
4,Hastings was unable to confirm news reports t...,"per:employee_of(e2,e1)",per:employee_of,Democratic,Bill Gwatney,,PERSON


In [43]:
# TO-DO
# basic imputation (e1_type and e2_type): for missing values use the most frequent concept for the relation

In [50]:
# TO-DO
# noisy values (e1_type and e2_type): if there are more types for a cell, choose the most frequent for the relation 
# or use rules (if country and city co-occurs, then choose the most specific)

In [53]:
# save/read: reuse the pickled dataset when it exists, otherwise write the
# dataset built by the cells above so the notebook survives Restart & Run All.
CACHE_PATH = Path("kbp37_CNLP.pickle")
if CACHE_PATH.exists():
    # NOTE: unpickling can execute arbitrary code; only load a file you produced yourself.
    dataset = pd.read_pickle(CACHE_PATH)
else:
    dataset.to_pickle(CACHE_PATH)

## To check:
- controllare la qualità del dataset, se non trovi a riguardo cerca per TACRED. Ad esempio la seguente annotazione mi sembra sbagliata: 

    " Russia is to sell off the major foreign assets of bankrupt <e1> oil group </e1> <e2> Yukos </e2> at auction in August with a starting price of just under 300 million dollars Russia's Federal Property Fund announced Saturday . " 
    org:alternate_names(e2,e1)

    avrei messo title(e2,e1) ma forse title esiste solo per le persone ... o comunque è una relazione ISA non codificata nel dataset
- idem alcune istanze di country_of_birth()