In [1]:
import glob
import json
import os
import random
import re
from collections import Counter

import matplotlib as plt
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from spacy.gold import GoldParse
from spacy.util import decaying
from spacy.util import filter_spans
from spacy.util import minibatch, compounding

%matplotlib inline

In [2]:
def getFileList(ftypes, start_path = '.'):
    """Recursively list the files under ``start_path`` whose path ends with ``ftypes``.

    Parameters
    ----------
    ftypes : str or iterable of str
        Suffix, or several suffixes, to keep (e.g. ``'.ann'`` or ``['.ann', '.txt']``).
    start_path : str
        Directory at which the recursive walk starts.

    Returns
    -------
    list of str
        Matching paths, sorted lexicographically.
    """
    # str.endswith accepts only a str or a tuple, so normalise lists/sets.
    suffixes = ftypes if isinstance(ftypes, str) else tuple(ftypes)

    matches = [
        os.path.join(dirpath, fname)
        for dirpath, _dirnames, filenames in os.walk(start_path)
        for fname in filenames
        if os.path.join(dirpath, fname).endswith(suffixes)
    ]

    # os.walk yields entries in whatever order the OS lists them. Sorting makes
    # the result reproducible, so two calls with different suffixes over the
    # same tree line up when they are later paired element by element.
    matches.sort()
    return matches

def loadAnnotations(ann_files, txt_files, types):
    """Read paired annotation/text files and collect the entity spans of the requested types.

    Only lines starting with ``'T'`` are considered. Each is split on
    whitespace and read as ``<id> <label> <start> <end> ...``.

    Parameters
    ----------
    ann_files, txt_files : sequence of str
        Paths of the annotation files and of their text files; the two
        sequences are paired positionally with ``zip``.
    types : container of str
        Labels to keep; every other label is ignored.

    Returns
    -------
    list of (str, dict)
        One ``(text, {"entities": [(start, end, label), ...]})`` item per file
        that has at least one kept entity.
    """
    ANNOTATIONS = []

    for af, tf in zip(ann_files, txt_files):
        # Both files are read as UTF-8 so decoding does not depend on the
        # platform's default encoding.
        with open(af, 'r', encoding='utf8') as ann_data, open(tf, 'r', encoding='utf8') as text_data:
            text = text_data.read()
            entities = []
            for line in ann_data:
                if not line.startswith('T'):
                    continue
                fields = line.split()
                if len(fields) < 4 or fields[1] not in types:
                    continue
                try:
                    start, end = int(fields[2]), int(fields[3])
                except ValueError:
                    # Offsets that are not a plain start/end pair (e.g. a
                    # discontinuous span written "9 14;20 25") cannot be
                    # expressed as one (start, end, label) triple: skip them.
                    continue
                entities.append((start, end, fields[1]))
            if entities:
                ANNOTATIONS.append((text, {"entities": entities}))

    return ANNOTATIONS

def change_annotation(listt):
    """Rewrite fine-grained entity labels to their short label.

    Parameters
    ----------
    listt : list of (str, dict)
        Items shaped like ``(text, {"entities": [(start, end, label), ...]})``.

    Returns
    -------
    list of (str, dict)
        Same structure, with each known label replaced by its short form
        (``LOC``, ``ORG``, ``PER``, ``CUR``, ``ACT``, ``ACQ``, ``ROL``).
        Entities with any other label are kept unchanged; empty entries
        (``None`` or ``""``) are dropped.
    """
    label_groups = {
        "LOC": ("WorldRegion", "Country", "LocalRegion", "City"),
        # "Organization " (trailing space) is kept for backward compatibility;
        # the plain spelling is what a whitespace-split label looks like.
        "ORG": ("Organization", "Organization ", "Association", "GPE", "Company", "Media"),
        "PER": ("Person",),
        "CUR": ("Currency",),
        "ACT": ("Activity",),
        "ACQ": ("Acquisition",),
        "ROL": ("Role",),
    }
    short_label = {raw: short for short, raws in label_groups.items() for raw in raws}

    res = []
    for item in listt:
        converted = []
        for ent in item[1]["entities"]:
            # Drop empty placeholders *before* indexing into the entity;
            # indexing None or "" would raise.
            if not ent:
                continue
            label = ent[2]
            if label in short_label:
                converted.append((ent[0], ent[1], short_label[label]))
            else:
                converted.append(ent)
        res.append((item[0], {"entities": converted}))
    return res


In [12]:
# Reads paired .ann/.txt files (BRAT standoff) and returns, per document, the
# 'hasRole' and 'roleDepartment' relations with the offsets and text of both arguments.
def loadRelations(ann_files, txt_files):
    """Collect the ``hasRole`` / ``roleDepartment`` relations of each document.

    Lines starting with ``'T'`` are read as ``<id> <label> <start> <end> ...``
    and lines starting with ``'R'`` as ``<id> <type> Arg1:<id> Arg2:<id>``
    (both split on whitespace).

    Parameters
    ----------
    ann_files, txt_files : sequence of str
        Paths of the annotation files and of their text files; paired
        positionally with ``zip``.

    Returns
    -------
    list of (str, dict)
        One ``(text, {"relations": [(type, (start, end, text), (start, end, text)), ...]})``
        item per file with at least one resolvable relation. For ``hasRole``
        the ``Person`` argument comes first; for ``roleDepartment`` the
        ``Role`` argument comes first. Relations whose arguments cannot be
        resolved to a usable ``T`` line are skipped.
    """
    # Label of the argument that must come first in the output, per relation type.
    leading_label = {'hasRole': 'Person', 'roleDepartment': 'Role'}

    REL = []
    for af, tf in zip(ann_files, txt_files):
        # Read both files as UTF-8 so decoding does not depend on the platform default.
        with open(af, 'r', encoding='utf8') as ann_data, open(tf, 'r', encoding='utf8') as text_data:
            text = text_data.read()
            relation_lines = []
            # id -> (position in file, label, start, end, covered text)
            entities = {}
            for line in ann_data:
                fields = line.split()
                if line.startswith('R'):
                    if len(fields) >= 4 and fields[1] in leading_label:
                        relation_lines.append(fields)
                elif line.startswith('T'):
                    if len(fields) < 4:
                        continue
                    try:
                        start, end = int(fields[2]), int(fields[3])
                    except ValueError:
                        # Offsets that are not a plain start/end pair
                        # (e.g. "10 14;15 20") cannot be sliced: ignore the entity.
                        continue
                    # First definition of an id wins.
                    entities.setdefault(
                        fields[0],
                        (len(entities), fields[1], start, end, text[start:end]),
                    )

            res = []
            for fields in relation_lines:
                arg_ids = (fields[2][len('Arg1:'):], fields[3][len('Arg2:'):])
                if any(arg_id not in entities for arg_id in arg_ids):
                    # Dangling reference: skip instead of failing on the whole corpus.
                    continue
                # Consider the two arguments in the order their T lines appear in
                # the file, then swap if the expected leading label is not first.
                first, second = sorted(
                    (entities[arg_id] for arg_id in arg_ids), key=lambda ent: ent[0]
                )
                if first[1] != leading_label[fields[1]]:
                    first, second = second, first
                res.append((fields[1], first[2:], second[2:]))

            if res:
                REL.append((text, {"relations": res}))

    return REL

In [4]:
# Collect the annotation files and the text files of the sample corpus.
ann4 = getFileList('.ann', './Sample_cased')
txt4 = getFileList('.txt', './Sample_cased')

# Load the entities of the listed types, then map their labels to the short label set.
# NOTE(review): "Organization " carries a trailing space; loadAnnotations compares it
# with whitespace-split tokens, which never contain a space -- confirm the intended label.
TRAIN_DATA4 = change_annotation(
    loadAnnotations(
        ann4,
        txt4,
        [
            "Organization ",
            "LOC",
            "Role",
            "WorldRegion",
            "Country",
            "LocalRegion",
            "City",
            "Association",
            "GPE",
            "Company",
            "Media",
            "Acquisition",
            "Person",
            "Currency",
        ],
    )
)

In [5]:
# Document-level relations for the sample corpus.
rds = loadRelations(ann4, txt4)
# Show only the first few documents; displaying the whole list floods the cell output.
rds[:3]

[("Plus jeune femme de sa génération promue associée chez Accenture après avoir fait ses classes au marketing d'Unilever, elle dirige la politique de développement durable du groupe Accor depuis 2010 et vient de prendre la direction des Académies, l'organe de formation interne du groupe. Ce qui propulse cette mère de deux enfants, formée à l'IEP Strasbourg et l'EM Lyon, au comité de direction mondial, le top 20 managérial de l'entreprise.",
  {'relations': [('roleDepartment',
     (41, 49, 'associée'),
     (55, 64, 'Accenture')),
    ('roleDepartment',
     (124, 168, 'dirige la politique de développement durable'),
     (179, 184, 'Accor'))]}),
 ("ACQUISITION\nC'est la guerre entre les Samsung, Apple, Google et autres HTC. Résultat paradoxal : la propriété industrielle freine l'innovation.\n Le monde du high-tech vient de s'inventer un nouveau conflit, celui des brevets. On a ainsi assisté tout l'été à des joutes entre Apple, Samsung, Google, HTC, Microsoft et autres acteurs du mobil

In [6]:
from nltk import tokenize
def splitRelations(old_train):
    """Split document-level relation annotations into sentence-level ones.

    Parameters
    ----------
    old_train : list of (str, dict)
        Items shaped like
        ``(text, {"relations": [(type, (start, end, text), (start, end, text)), ...]})``
        with offsets relative to the whole text.

    Returns
    -------
    list of (str, dict)
        One ``(sentence, {"relations": [...]})`` item per sentence that fully
        contains both arguments of at least one relation; offsets are
        relative to the start of the sentence.
    """
    new_train = []
    for article in old_train:
        full_text = article[0]
        old_ents = article[1]['relations']

        # Search each sentence from the end of the previous one, so a sentence
        # that occurs several times in the text gets its own offset rather
        # than the offset of its first occurrence.
        cursor = 0
        for sent in tokenize.sent_tokenize(full_text, language='french'):
            sent_start = full_text.find(sent, cursor)
            if sent_start == -1:
                # Not found after the cursor: fall back to a search from the start.
                sent_start = full_text.index(sent)
            else:
                cursor = sent_start + len(sent)
            sent_end = sent_start + len(sent)

            new_ents = []
            for item in old_ents:
                first, second = item[1], item[2]
                # Both arguments must lie entirely inside the sentence; checking
                # only the end offset would let an argument that starts before
                # the sentence through with a negative relative offset.
                inside = all(
                    sent_start <= arg[0] and arg[1] <= sent_end
                    for arg in (first, second)
                )
                if inside:
                    new_ents.append((
                        item[0],
                        (first[0] - sent_start, first[1] - sent_start, first[2]),
                        (second[0] - sent_start, second[1] - sent_start, second[2]),
                    ))
            if new_ents:
                new_train.append((sent, {'relations': new_ents}))
    return new_train

In [7]:
# Sentence-level version of the relation annotations.
srds = splitRelations(rds)
# Show only the first few sentences; displaying the whole list floods the cell output.
srds[:3]

[("Plus jeune femme de sa génération promue associée chez Accenture après avoir fait ses classes au marketing d'Unilever, elle dirige la politique de développement durable du groupe Accor depuis 2010 et vient de prendre la direction des Académies, l'organe de formation interne du groupe.",
  {'relations': [('roleDepartment',
     (41, 49, 'associée'),
     (55, 64, 'Accenture')),
    ('roleDepartment',
     (124, 168, 'dirige la politique de développement durable'),
     (179, 184, 'Accor'))]}),
 ("Ainsi, au lieu d'être dédié à l'innovation, cet argent a été consacré à acheter une défense juridique contre l'activisme procédurier d'Apple.« Un smartphone peut mettre en jeu jusqu'à 250 000 brevets, expliquait alors David Drummond, directeur juridique de Google.",
  {'relations': [('hasRole',
     (218, 232, 'David Drummond'),
     (234, 253, 'directeur juridique')),
    ('roleDepartment',
     (234, 253, 'directeur juridique'),
     (257, 263, 'Google'))]}),
 ("Ce dernier, qu'on supposait

In [8]:
def _role_relations(candidate, role):
    """Return the relation tuples linking ``role`` to ``candidate``, chosen by ``candidate.ent_type_``.

    ``PER`` gives ``('hasRole', candidate, role)``, ``ORG`` gives
    ``('roleDepartment', role, candidate)``, anything else gives ``[]``.
    """
    if candidate.ent_type_ == 'PER':
        return [('hasRole', str(candidate), str(role))]
    if candidate.ent_type_ == 'ORG':
        return [('roleDepartment', str(role), str(candidate))]
    return []


def extract_relations(doc):
    """Extract ``hasRole`` / ``roleDepartment`` relations from a parsed document with dependency rules.

    Entity spans are first merged into single tokens (this modifies ``doc``
    in place). Every token whose entity type is ``ROL`` is then matched
    against a set of rules keyed on its dependency label.

    Parameters
    ----------
    doc : spaCy ``Doc``
        Must carry entities and a dependency parse.

    Returns
    -------
    list of tuple
        ``('hasRole', person_text, role_text)`` and
        ``('roleDepartment', role_text, organisation_text)`` tuples. The same
        tuple can appear more than once when several rules fire.
    """
    # Merge each entity into one token so the rules see entities as single nodes.
    spans = filter_spans(list(doc.ents))
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)

    relations = []
    for target in (tok for tok in doc if tok.ent_type_ == 'ROL'):
        if target.dep_ == "obl":
            # Role attached as "obl": use the first "nsubj:pass" to the left of its head.
            subject = [w for w in target.head.lefts if w.dep_ == "nsubj:pass"]
            if subject:
                relations.extend(_role_relations(subject[0], target))

        if target.dep_ == "appos":
            head = target.head
            relations.extend(_role_relations(head, target))
            # NOTE(review): in the original rule set this fallback is the `else`
            # of the ORG test only, so it also runs when the head is a PER.
            # Behaviour is kept as is -- confirm whether if/elif/else was intended.
            if head.ent_type_ != 'ORG':
                head_mods = [w for w in head.children if w.dep_ == "nmod"]
                if head_mods:
                    relations.extend(_role_relations(head_mods[0], target))
            own_mods = [w for w in target.children
                        if w.dep_ == 'nmod' and w.ent_type_ in ["PER", "ORG"]]
            if own_mods:
                relations.extend(_role_relations(own_mods[0], target))

        if target.dep_ == "acl":
            own_mods = [w for w in target.children
                        if w.dep_ == 'nmod' and w.ent_type_ in ["PER", "ORG"]]
            if own_mods:
                relations.extend(_role_relations(own_mods[0], target))

        if target.dep_ == "nsubj":
            complements = [w for w in target.children if w.dep_ in ["obj", "appos"]]
            if complements:
                relations.extend(_role_relations(complements[0], target))

        # Generic rule, applied whatever the role's own dependency label is.
        # NOTE(review): only the first child's entity type is tested, yet a
        # relation is emitted for every child in the list -- confirm intent.
        subject = [w for w in target.children if w.dep_ in ("nmod", "nsubj", "obl")]
        if subject:
            if subject[0].ent_type_ == 'PER':
                relations.extend([('hasRole', str(s), str(target)) for s in subject])
            if subject[0].ent_type_ == 'ORG':
                relations.extend([('roleDepartment', str(target), str(s)) for s in subject])

    return relations

In [11]:
# Load the spaCy pipeline saved under ./nerav1 (relative to the working directory).
nlp = spacy.load("./nerav1")

In [9]:
# Exact-match evaluation of extract_relations against gold relations.
def evaluate(test_data, model):
    """Score ``extract_relations`` against gold relations using exact text matches.

    Parameters
    ----------
    test_data : list of (str, dict)
        Items shaped like
        ``(text, {"relations": [(type, (start, end, text), (start, end, text)), ...]})``.
    model : callable
        Called on each text; its result is passed to ``extract_relations``.

    Returns
    -------
    (float, float, float)
        Precision, recall and F1 over the whole dataset. A value is ``nan``
        when its denominator is zero.
    """
    true_pos, n_pred, n_gold = 0, 0, 0
    for item in test_data:
        # Compare *sets* of (type, text, text) triples: the extractor can emit
        # the same triple from several rules, and counting each copy as a hit
        # would inflate precision and let recall exceed 1.
        golds = {
            (gold[0], gold[1][2], gold[2][2])
            for gold in item[1]['relations']
            if gold[0] in ('roleDepartment', 'hasRole')
        }
        pred = set(extract_relations(model(item[0])))
        n_gold += len(golds)
        n_pred += len(pred)
        true_pos += len(pred & golds)

    precision = true_pos / n_pred if n_pred else float('nan')
    recall = true_pos / n_gold if n_gold else float('nan')

    # A nan operand propagates to nan; a zero sum is mapped to nan as well.
    denom = recall + precision
    f1 = 2 * (recall * precision) / denom if denom else float('nan')

    return precision, recall, f1

In [10]:
def overlaps(element, golds):
    """Tell whether ``element`` partially matches one of ``golds``.

    A gold matches when it has the same relation type (index 0) and, for each
    of the two arguments (indices 1 and 2), one string contains the other.
    """
    for gold in golds:
        if gold[0] != element[0]:
            continue
        if not (element[1] in gold[1] or gold[1] in element[1]):
            continue
        if element[2] in gold[2] or gold[2] in element[2]:
            return True
    return False

def evaluate_partials(test_data, model):
    """Score ``extract_relations`` against gold relations using partial matches (see ``overlaps``).

    Parameters
    ----------
    test_data : list of (str, dict)
        Items shaped like
        ``(text, {"relations": [(type, (start, end, text), (start, end, text)), ...]})``.
    model : callable
        Called on each text; its result is passed to ``extract_relations``.

    Returns
    -------
    (float, float, float)
        Precision (share of distinct predictions that overlap a gold), recall
        (share of distinct golds overlapped by a prediction) and their F1.
        A value is ``nan`` when its denominator is zero.
    """
    matched_pred, matched_gold, n_pred, n_gold = 0, 0, 0, 0
    for item in test_data:
        # De-duplicate while keeping order: repeated triples would otherwise be
        # counted several times.
        golds = list(dict.fromkeys(
            (gold[0], gold[1][2], gold[2][2])
            for gold in item[1]['relations']
            if gold[0] in ('roleDepartment', 'hasRole')
        ))
        preds = list(dict.fromkeys(extract_relations(model(item[0]))))

        n_gold += len(golds)
        n_pred += len(preds)
        matched_pred += sum(1 for p in preds if overlaps(p, golds))
        # Recall counts golds that were found, not predictions that hit:
        # several predictions may overlap the same gold.
        matched_gold += sum(1 for g in golds if any(overlaps(p, [g]) for p in preds))

    precision = matched_pred / n_pred if n_pred else float('nan')
    recall = matched_gold / n_gold if n_gold else float('nan')

    # A nan operand propagates to nan; a zero sum is mapped to nan as well.
    denom = recall + precision
    f1 = 2 * (recall * precision) / denom if denom else float('nan')

    return precision, recall, f1