#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
import sys, os
import re
import datetime
from collections import deque, defaultdict
import xml.etree.cElementTree as ET
import xml.etree.ElementTree as PET
try:
    # rdflib 2.x (capitalized module names); Collection is imported as
    # a module here, matching the Collection.Collection(...) calls below
    from rdflib import Namespace, URIRef, Literal, RDF, BNode, Collection
    from rdflib.Graph import Graph
except ImportError:
    from rdflib import Namespace, URIRef, Literal, RDF, BNode
    from rdflib import Graph
    # bind the module rather than the class, so that
    # Collection.Collection(...) works in this branch too
    from rdflib import collection as Collection
from whoosh import analysis, fields, formats, query, qparser, scoring
from whoosh.filedb.filestore import RamStorage, FileStorage
from DocumentRepository import DocumentRepository
import Util
import LegalURI
from LegalRef import LegalRef, Link
from DataObjects import UnicodeStructure, CompoundStructure, OrdinalStructure, serialize
__version__ = (1,6)
__author__ = u"Staffan Malmgren <staffan@tomtebo.org>"
# The general outline of a treaty is (C = CompoundStructure,
# O = OrdinalStructure):
#
# <Body> C
#   <Paragraph> C (unicode/Link) - starting and ending titles
#   <Preamble> C
#     <Paragraph> - the typographic term, aka "Stycke"
#   <Part> CO - not present for TEU
#     <Title> CO
#       <Chapter> CO
#         <Section> CO
#           <Article> CO
#             <Subarticle> CO
#               <Paragraph> C
#                 <unicode>
#                 <Link>
#               <UnorderedList leader="dash"> C
#                 <ListItem> C
#               <OrderedList type="letter"> CO
class IDStructure(object):
id = None
attrs = None
class Body(CompoundStructure, IDStructure): pass
class Paragraph(CompoundStructure, IDStructure): pass
class Preamble(CompoundStructure, IDStructure): pass
class Part(CompoundStructure, IDStructure, OrdinalStructure): pass
class Title(CompoundStructure, IDStructure, OrdinalStructure): pass
class Chapter(CompoundStructure, IDStructure, OrdinalStructure): pass
class Section(CompoundStructure, IDStructure, OrdinalStructure): pass
class Article(CompoundStructure, IDStructure, OrdinalStructure):
fragment_label = "A"
rdftype = "eurlex:Article"
class Subarticle(CompoundStructure, IDStructure, OrdinalStructure):
fragment_label = "P"
rdftype = "eurlex:Subarticle"
class UnorderedList(CompoundStructure, IDStructure): pass
class OrderedList(CompoundStructure, IDStructure, OrdinalStructure): pass
class ListItem(CompoundStructure, IDStructure):
fragment_label = "L"
rdftype = "eurlex:ListItem"
DCT = Namespace(Util.ns['dct'])
XSD = Namespace(Util.ns['xsd'])
RINFO = Namespace(Util.ns['rinfo'])
RINFOEX = Namespace(Util.ns['rinfoex'])
EX = Namespace(URIRef("http://www.example.org/"))
class EurlexTreaties(DocumentRepository):
# overrides of superclass variables
module_dir = "eut" # European Union Treaties
start_url = "http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:C:2008:115:0001:01:EN:HTML"
document_url = "http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=OJ:C:2008:115:0001:01:EN:HTML#%s"
source_encoding = "utf-8"
    genshi_tempate = "genshi/supergeneric.xhtml" # sic -- must match the attribute name used by DocumentRepository
# own class variables
vocab_url = Util.ns['eurlex']
    def download_everything(self, cache=False):
        self.log.info("Downloading the TEU and the TFEU")
        self.download_single("teu")
        self.download_single("tfeu")
    re_part = re.compile(r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN)$").match
    re_title = re.compile(r"TITLE ([IVX]+)$").match
    re_chapter = re.compile(r"CHAPTER (\d+)$").match
    re_section = re.compile(r"SECTION (\d+)$").match
    re_article = re.compile(r"Article (\d+)$").match
    re_subarticle = re.compile(r"^(\d+)\. ").search
    re_unorderedliststart = re.compile(r"^- ").search
    re_orderedliststart = re.compile(r"^\(\w\) ").search
    re_romanliststart = re.compile(r"^\([ivx]+\) ").search
ordinal_list = ('ONE', 'TWO', 'THREE', 'FOUR', 'FIVE', 'SIX', 'SEVEN',
'EIGHT', 'NINE', 'TEN', 'ELEVEN', 'TWELVE')
ordinal_dict = dict(zip(ordinal_list, range(1,len(ordinal_list)+1)))
# Example code from http://www.diveintopython.org/
roman_numeral_map = (('M', 1000),
('CM', 900),
('D', 500),
('CD', 400),
('C', 100),
('XC', 90),
('L', 50),
('XL', 40),
('X', 10),
('IX', 9),
('V', 5),
('IV', 4),
('I', 1))
def _from_roman(self,s):
"""convert Roman numeral to integer"""
result = 0
index = 0
for numeral, integer in self.roman_numeral_map:
while s[index:index+len(numeral)] == numeral:
result += integer
index += len(numeral)
return result
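    # e.g. _from_roman("IV") == 4, _from_roman("XIV") == 14 and
    # _from_roman("MCMXC") == 1990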
def parse_from_soup(self,soup,basefile):
g = Graph()
self.log.info("%s: Parsing" % basefile)
if basefile == "teu":
# FIXME: Use a better base URI?
uri = 'http://rinfo.lagrummet.se/extern/celex/12008M'
startnode = soup.findAll(text="-"*50)[1].parent
g.add((URIRef(uri),DCT['title'],Literal("Treaty on European Union")))
        elif basefile == "tfeu":
            uri = 'http://rinfo.lagrummet.se/extern/celex/12008E'
            startnode = soup.findAll(text="-"*50)[2].parent
            g.add((URIRef(uri),DCT['title'],Literal("Treaty on the Functioning of the European Union")))
        else:
            # guard against a NameError on uri/startnode further down
            raise ValueError("Unknown basefile %r (expected 'teu' or 'tfeu')" % basefile)
lines = deque()
for p in startnode.findNextSiblings("p"):
if p.string == "-" * 50:
self.log.info("found the end")
break
else:
if p.string:
lines.append(unicode(p.string))
self.log.info("%s: Found %d lines" % (basefile,len(lines)))
body = self.make_body(lines)
self.process_body(body, '', uri)
# print serialize(body)
return {'meta':g,
'body':body,
'lang':'en',
'uri':uri}
# To make Paragraph and our other stuff available to Genshi
def get_globals(self):
return globals()
def make_body(self,lines):
b = Body()
while lines:
line = lines.popleft()
if line == "PREAMBLE":
b.append(self.make_preamble(lines))
elif self.re_title(line):
lines.appendleft(line)
b.append(self.make_title(lines))
elif self.re_part(line):
lines.appendleft(line)
b.append(self.make_part(lines))
else:
b.append(Paragraph([line]))
# print type(b[-1])
return b
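    # The make_* methods below share a recursive-descent pattern: pop
    # the heading line(s), then consume lines until a heading of the
    # same or higher level appears (pushed back onto the deque so the
    # caller can handle it), recursing into lower-level constructs.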
def make_preamble(self,lines):
p = Preamble(title="PREAMBLE")
while lines:
line = lines.popleft()
if (self.re_part(line) or self.re_title(line)):
lines.appendleft(line)
return p
else:
p.append(Paragraph([line]))
self.log.warn("make_preamble ran out of lines!")
        return p
def make_part(self,lines):
partnumber = lines.popleft()
ordinal = self.ordinal_dict[self.re_part(partnumber).group(1)]
parttitle = lines.popleft()
p = Part(ordinal=ordinal,ordinaltitle=partnumber,title=parttitle)
while lines:
line = lines.popleft()
if (self.re_part(line)):
lines.appendleft(line)
return p
elif (self.re_title(line)):
lines.appendleft(line)
p.append(self.make_title(lines))
elif (self.re_article(line)):
# print "make_part: %s matches article" % line
lines.appendleft(line)
p.append(self.make_article(lines))
else:
p.append(Paragraph([line]))
self.log.warn("make_part appended naked Paragraph '%s...'" % line[:25])
return p
def make_title(self,lines):
titlenumber = lines.popleft()
ordinal = self._from_roman(self.re_title(titlenumber).group(1))
titletitle = lines.popleft()
t = Title(ordinal=ordinal, ordinaltitle=titlenumber, title=titletitle)
while lines:
line = lines.popleft()
if (self.re_part(line) or self.re_title(line)):
lines.appendleft(line)
return t
elif (self.re_chapter(line)):
lines.appendleft(line)
t.append(self.make_chapter(lines))
elif (self.re_article(line)):
# print "make_title: %s matches article" % line
lines.appendleft(line)
t.append(self.make_article(lines))
else:
t.append(Paragraph([line]))
self.log.warn("make_title appended naked Paragraph '%s...'" % line[:25])
return t
def make_chapter(self,lines):
chapternumber = lines.popleft()
ordinal = int(self.re_chapter(chapternumber).group(1))
chaptertitle = lines.popleft()
c = Chapter(ordinal=ordinal, ordinaltitle=chapternumber,title=chaptertitle)
while lines:
line = lines.popleft()
if (self.re_part(line) or
self.re_title(line) or
self.re_chapter(line)):
lines.appendleft(line)
return c
elif (self.re_section(line)):
lines.appendleft(line)
c.append(self.make_section(lines))
elif (self.re_article(line)):
# print "make_chapter: %s matches article" % line
lines.appendleft(line)
c.append(self.make_article(lines))
else:
c.append(Paragraph([line]))
self.log.warn("make_chapter appended naked Paragraph '%s...'" % line[:25])
return c
def make_section(self,lines):
sectionnumber = lines.popleft()
ordinal = int(self.re_section(sectionnumber).group(1))
sectiontitle = lines.popleft()
s = Section(ordinal=ordinal, ordinaltitle=sectionnumber,title=sectiontitle)
while lines:
line = lines.popleft()
if (self.re_part(line) or
self.re_title(line) or
self.re_chapter(line) or
self.re_section(line)):
lines.appendleft(line)
return s
elif (self.re_article(line)):
# print "make_section: %s matches article" % line
lines.appendleft(line)
s.append(self.make_article(lines))
else:
s.append(Paragraph([line]))
self.log.warn("make_section appended naked Paragraph '%s...'" % line[:25])
return s
def make_article(self,lines):
articlenumber = lines.popleft()
ordinal = int(self.re_article(articlenumber).group(1))
self.log.info("Making article: %s" % ordinal)
        # peek at the next line for an "(ex Article ...)" reference
        if lines and lines[0].startswith("(ex Article"):
            a = Article(ordinal=ordinal, ordinaltitle=articlenumber,
                        exarticlenumber=lines.popleft())
        else:
            a = Article(ordinal=ordinal, ordinaltitle=articlenumber)
while lines:
line = lines.popleft()
if (self.re_part(line) or
self.re_title(line) or
self.re_chapter(line) or
self.re_section(line) or
self.re_article(line)):
lines.appendleft(line)
return a
elif (self.re_subarticle(line)):
lines.appendleft(line)
a.append(self.make_subarticle(lines))
elif (self.re_unorderedliststart(line)):
lines.appendleft(line)
a.append(self.make_unordered_list(lines,"dash"))
elif (self.re_orderedliststart(line)):
lines.appendleft(line)
a.append(self.make_ordered_list(lines,"lower-alpha"))
else:
# print "Appending %s" % line[:40]
a.append(Paragraph([line]))
return a
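    # make_article consumes input shaped like this (illustrative TFEU
    # example; the "(ex Article ...)" line is optional):
    #   Article 20
    #   (ex Article 17 TEC)
    #   1. Citizenship of the Union is hereby established. ...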
def make_subarticle(self,lines):
line = lines.popleft()
subarticlenum = int(self.re_subarticle(line).group(1))
# self.log.info("Making subarticle %d: %s" % (subarticlenum, line[:30]))
s = Subarticle(ordinal=subarticlenum)
lines.appendleft(line)
while lines:
line = lines.popleft()
if (self.re_part(line) or
self.re_title(line) or
self.re_chapter(line) or
self.re_section(line) or
self.re_article(line)):
lines.appendleft(line)
return s
elif (self.re_subarticle(line) and
int(self.re_subarticle(line).group(1)) != subarticlenum):
lines.appendleft(line)
return s
elif (self.re_unorderedliststart(line)):
lines.appendleft(line)
s.append(self.make_unordered_list(lines,"dash"))
elif (self.re_orderedliststart(line)):
lines.appendleft(line)
s.append(self.make_ordered_list(lines,"lower-alpha"))
else:
# this is OK
s.append(Paragraph([line]))
return s
def make_unordered_list(self,lines,style):
ul = UnorderedList(style=style)
while lines:
line = lines.popleft()
if not self.re_unorderedliststart(line):
lines.appendleft(line)
return ul
else:
ul.append(ListItem([line]))
return ul
def make_ordered_list(self,lines,style):
ol = OrderedList(style=style)
while lines:
line = lines.popleft()
# try romanliststart before orderedliststart -- (i) matches
# both, but is likely the former
if self.re_romanliststart(line):
# print "make_ordered_list: re_romanliststart: %s" % line[:40]
if style=="lower-roman":
ol.append(ListItem([line]))
else:
lines.appendleft(line)
ol.append(self.make_ordered_list(lines,"lower-roman"))
elif self.re_orderedliststart(line):
# print "make_ordered_list: re_orderedliststart: %s" % line[:40]
if style=="lower-alpha":
ol.append(ListItem([line]))
else: # we were in a roman-style sublist, so we should pop up
lines.appendleft(line)
return ol
else:
# print "make_ordered_list: done: %s" % line[:40]
lines.appendleft(line)
return ol
return ol
    # Post-process the document tree recursively in order to find
    # addressable units (resources that should have unique URIs,
    # e.g. articles and subarticles) and construct IDs for them, like
    # "A7", "A25(b)(ii)" (or A25S1P2N2 or...?) -- see the worked
    # example after this method.
    #
    # How should we handle Articles themselves -- they have individual
    # CELEX numbers and therefore URIs (but subarticles don't)?
def process_body(self, element, prefix, baseuri):
if type(element) == unicode:
return
# print "Starting with " + str(type(element))
counters = defaultdict(int)
for p in element:
counters[type(p)] += 1
# print "handling " + str(type(p))
if hasattr(p, 'fragment_label'): # this is an addressable resource
elementtype = p.fragment_label
if hasattr(p,'ordinal'):
elementordinal = p.ordinal
else:
elementordinal = counters[type(p)]
fragment = "%s%s%s" % (prefix, elementtype, elementordinal)
if elementtype == "A":
uri = "%s%03d" % (baseuri, elementordinal)
else:
uri = "%s%s%s" % (baseuri, elementtype, elementordinal)
p.id = fragment
p.attrs = {'id':p.id,
'about':uri,
'typeof':p.rdftype}
if elementtype == "A":
uri += "#"
else:
fragment = prefix
uri = baseuri
self.process_body(p,fragment,uri)
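    # Example: with baseuri ".../celex/12008E", article 7 gets the
    # fragment "A7" and URI ".../celex/12008E007", and its first
    # subarticle gets fragment "A7P1" and URI ".../celex/12008E007#P1".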
    def prep_annotation_file(self,basefile):
        self.log.info("%s: Preparing annotation file" % basefile)
baseline = self.ranked_set_baseline(basefile)
# goldstandard = self.ranked_set_goldstandard(basefile)
        # alternative (fake) ranked sets -- computed but currently unused
        rs2 = self.ranked_set_fake2(basefile)
        rs3 = self.ranked_set_fake3(basefile)
        rs4 = self.ranked_set_fake4(basefile)
# goldstandard = {'1': ['62009J0014','62009J0197','62009J0357','62009J0403','62009A0027']}
# self.calculate_map(rs1,goldstandard)
        goldstandard = {'1': [['62009J0014', u'Genc v Land Berlin (100%)'],
                              ['62009J0197', u'Agence européenne des médicaments (90%)'],
                              ['62009J0357', u'Huchbarov (80%)'],
                              ['62009J0403', u'Jasna Deticke (70%)'],
                              ['62009A0027', u'Stella Kunststofftechnik (60%)']]}
sets = [{'label':'Baseline',
'data':baseline},
{'label':'Gold standard',
'data':goldstandard}]
g = Graph()
g.bind('dct',self.ns['dct'])
g.bind('rinfoex',self.ns['rinfoex'])
XHT_NS = "{http://www.w3.org/1999/xhtml}"
tree = ET.parse(self.parsed_path(basefile))
        els = tree.findall(".//"+XHT_NS+"div")
articles = []
for el in els:
if 'typeof' in el.attrib and el.attrib['typeof'] == "eurlex:Article":
article = unicode(el.attrib['id'][1:])
articles.append(article)
        for article in articles:
            print "Results for article %s" % article
            # NB: the URI is hardcoded for the TFEU (CELEX 12008E)
            articlenode = URIRef("http://rinfo.lagrummet.se/extern/celex/12008E%03d" % int(article))
resultsetcollectionnode = BNode()
g.add((resultsetcollectionnode, RDF.type, RDF.List))
rc = Collection.Collection(g,resultsetcollectionnode)
g.add((articlenode, DCT["relation"], resultsetcollectionnode))
for s in sets:
resultsetnode = BNode()
listnode = BNode()
rc.append(resultsetnode)
g.add((resultsetnode, RDF.type, RINFOEX["RelatedContentCollection"]))
g.add((resultsetnode, DCT["title"], Literal(s["label"])))
g.add((resultsetnode, DCT["hasPart"], listnode))
c = Collection.Collection(g,listnode)
g.add((listnode, RDF.type, RDF.List))
if article in s['data']:
print " Set %s" % s['label']
for result in s['data'][article]:
resnode = BNode()
g.add((resnode, DCT["references"], Literal(result[0])))
g.add((resnode, DCT["title"], Literal(result[1])))
c.append(resnode)
print " %s" % result[1]
# self.graph_to_image(g,"png",self.annotation_path(basefile)+".png")
return self.graph_to_annotation_file(g, basefile)
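    # The resulting annotation graph hangs an RDF list off each article
    # node (via dct:relation); each member is a RelatedContentCollection
    # whose dct:hasPart is in turn an RDF list of result nodes carrying
    # dct:references (a CELEX number) and dct:title.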
def graph_to_image(self,graph,imageformat,filename):
import pydot
import rdflib
dot = pydot.Dot()
dot.progs = {"dot": "c:/Program Files/Graphviz2.26.3/bin/dot.exe"}
# dot.progs = {"dot": "c:/Program Files (x86)/Graphviz2.26.3/bin/dot.exe"}
# code from rdflib.util.graph_to_dot, but adjusted to handle unicode
nodes = {}
for s, o in graph.subject_objects():
for i in s,o:
                if i not in nodes:
if type(i) == rdflib.BNode:
nodes[i] = repr(i)[7:]
elif type(i) == rdflib.Literal:
nodes[i] = repr(i)[16:-1]
elif type(i) == rdflib.URIRef:
nodes[i] = repr(i)[22:-2]
for s, p, o in graph.triples((None,None,None)):
dot.add_edge(pydot.Edge(nodes[s], nodes[o], label=repr(p)[22:-2]))
print "Writing %s format to %s" % (imageformat, filename)
Util.ensureDir(filename)
dot.write(path=filename,prog="dot",format=imageformat)
print "Wrote %s" % filename
    def calculate_map(self, rankedset, goldstandard):
        """Calculate the mean average precision of a ranked set against
        a gold standard mapping each key to its relevant documents."""
        aps = []
        for key in goldstandard.keys():
            ranking = [x[0] for x in rankedset[key]]
            precisions = []
            relevant_hits = 0
            total_hits = 0
            for r in ranking:
                total_hits += 1
                if r in goldstandard[key]:
                    # average precision samples precision at the rank
                    # of each relevant hit only
                    relevant_hits += 1
                    precisions.append(relevant_hits / float(total_hits))
                    print " Precision at %s: %.2f" % (total_hits, precisions[-1])
            ap = sum(precisions) / float(len(goldstandard[key]))
            print " Average precision: %s" % ap
            aps.append(ap)
        res = sum(aps) / float(len(aps))
        print "Mean average precision: %s" % res
        return res
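    # Worked example (using the average precision definition above):
    # with goldstandard {'1': ['a', 'b']} and ranking ['a', 'x', 'b'],
    # precision is sampled at rank 1 (1/1) and rank 3 (2/3), giving
    # AP = (1.0 + 0.667) / 2 = 0.83.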
    def ranked_set_fake1(self, basefile):
        return {'1': [['62009J0014', u'Genc v Land Berlin (100%)'],
                      ['62009J0197', u'Agence européenne des médicaments (90%)'],
                      ['62009J0357', u'Huchbarov (80%)'],
                      ['62009J0403', u'Jasna Deticke (70%)'],
                      ['62009A0027', u'Stella Kunststofftechnik (60%)']]}
    def ranked_set_fake2(self, basefile):
        return {'1': [['62009J0197', u'Agence européenne des médicaments (100%)'],
                      ['62009J0014', u'Genc v Land Berlin (90%)'],
                      ['62009J0357', u'Huchbarov (80%)'],
                      ['62009J0403', u'Jasna Deticke (70%)'],
                      ['62009A0027', u'Stella Kunststofftechnik (60%)']]}
    def ranked_set_fake3(self, basefile):
        return {'1': [['62009J0357', u'Huchbarov (100%)'],
                      ['62009J0403', u'Jasna Deticke (90%)'],
                      ['62009J0014', u'Genc v Land Berlin (80%)'],
                      ['62009J0187', u'Commission v United Kingdom (70%)'],
                      ['62009A0027', u'Stella Kunststofftechnik (60%)']]}
    def ranked_set_fake4(self, basefile):
        return {'1': [['62009J0014', u'Genc v Land Berlin (100%)'],
                      ['62009J0197', u'Agence européenne des médicaments (90%)'],
                      ['62009J0357', u'Huchbarov (80%)'],
                      ['62009A0027', u'Stella Kunststofftechnik (70%)'],
                      ['62009J0403', u'Jasna Deticke (60%)']]}
# computes a ranked set for each baseline using a naive search
# (using the most significant words of each article) and the
# standard BM25F ranking function
def ranked_set_baseline(self,basefile):
# Helper from http://effbot.org/zone/element-lib.htm
def flatten(elem, include_tail=0):
text = elem.text or ""
for e in elem:
text += flatten(e, 1)
if include_tail and elem.tail: text += elem.tail
return text
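        # e.g. flatten applied to <div>See <a>Article 7</a> above</div>
        # yields "See Article 7 above" (a hypothetical illustration)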
# step 1: Create a temporary whoosh index in order to find out
# the most significant words for each article
ana = analysis.StandardAnalyzer()
# ana = analysis.StemmingAnalyzer()
vectorformat = formats.Frequency(ana)
schema = fields.Schema(article=fields.ID(unique=True),
title=fields.TEXT(stored=True),
content=fields.TEXT(analyzer=ana,
vector=vectorformat))
st = RamStorage()
tmpidx = st.create_index(schema)
w = tmpidx.writer()
XHT_NS = "{http://www.w3.org/1999/xhtml}"
tree = ET.parse(self.parsed_path(basefile))
        els = tree.findall(".//"+XHT_NS+"div")
articles = []
for el in els:
if 'typeof' in el.attrib and el.attrib['typeof'] == "eurlex:Article":
text = Util.normalizeSpace(flatten(el))
article = unicode(el.attrib['id'][1:])
articles.append(article)
w.update_document(article=article,title="Article "+ article,content=text)
w.commit()
self.log.info("Indexed %d articles" % len(articles))
        # Step 2: Open the large whoosh index containing the text of
        # all cases. Then, for each article, use its most distinctive
        # terms (numterms below, filtering away numbers) to create a
        # query against that index.
        # Things to vary:
        # * numterms
        # * connector (AND or OR)
        # * scoring (weighting=scoring.Cosine())
numterms = 5
connector = " AND "
indexdir = os.path.sep.join([self.config['datadir'],'ecj','index'])
storage = FileStorage(indexdir)
idx = storage.open_index()
searcher = idx.searcher(weighting=scoring.BM25F())
tempsearch = tmpidx.searcher()
rankedset = {}
for article in articles:
rankedset[article] = []
r = tempsearch.search(query.Term("article",article))
terms = [t[0] for t in r.key_terms("content", numterms=numterms+1) if not t[0].isdigit()][:numterms]
print "Article %s:%r" % (article, terms)
parser = qparser.QueryParser("content")
q = parser.parse(connector.join(terms))
results = searcher.search(q, limit=10)
resultidx = 0
            for result in results:
                reslbl = "%s (%s)" % (result['title'], results.score(resultidx))
                rankedset[article].append([result['basefile'], reslbl])
                print u"\t%s" % reslbl
                resultidx += 1
return rankedset
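    # The returned rankedset has the same shape as the ranked_set_fake*
    # methods above: {article ordinal: [[celex number, "title (score)"], ...]}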
if __name__ == "__main__":
EurlexTreaties.run()