Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
410 lines (359 sloc) 19.8 KB
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
import sys,os
import re
import datetime
# Assume RDFLib 3.0
from rdflib import Namespace, URIRef, Literal, RDF, Graph
from mechanize import LinkNotFoundError
from whoosh import analysis, qparser
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from DocumentRepository import DocumentRepository
import Util
import LegalURI
from LegalRef import LegalRef, Link
from DataObjects import UnicodeStructure, CompoundStructure, Paragraph
# Module metadata: maintainer contact and (major, minor) version tuple.
__author__ = u"Staffan Malmgren <staffan@tomtebo.org>"
__version__ = (1, 6)
class Body(CompoundStructure):
    """Container for the paragraphs that make up a parsed document body."""
class ListItem(CompoundStructure):
    """Compound list-item structure; needed for genshi/generic.xhtml."""
class EurlexCaselaw(DocumentRepository):
    """Document repository for case law published on EUR-Lex.

    Finds CELEX numbers in the yearly EUR-Lex result lists, hands them
    to ``download_single`` and parses downloaded documents into an RDF
    metadata graph plus a body of paragraphs. Files are laid out as
    ``<base_dir>/<module_dir>/<stage>/<year>/<celexno>.<ext>``.
    """

    module_dir = "ecj"  # European Court of Justice
    start_url = "http://eur-lex.europa.eu/JURISIndex.do"
    document_url = "http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=CELEX:%s:EN:NOT"
    vocab_url = "http://lagen.nu/eurlex#"
    source_encoding = "utf-8"
    # Groups: (1) the literal "6", (2) four-digit year, (3) document
    # type letter, (4) four-digit number, (5) optional "(NN)" suffix.
    re_celexno = re.compile(r'(6)(\d{4})(\w)(\d{4})(\(\d{2}\)|)')

    def download_everything(self, usecache=False):
        """Walk the yearly result lists and download every judgment found.

        For each year from the start year up to and including the
        current year, every result page is scanned for CELEX numbers and
        the ">" (next page) link is followed until it no longer exists.
        ``download_single`` is called for each unique CELEX number that
        contains one of the letters J, A, W or T.

        :param usecache: forwarded to ``download_single``. When true and
            ``self.moduleconfig`` has a ``startyear`` entry, the crawl
            starts at that year instead of 1954.
        """
        self.log.debug("Downloading, usecache is %s" % usecache)
        if usecache and 'startyear' in self.moduleconfig:
            startyear = int(self.moduleconfig['startyear'])
        else:
            startyear = 1954  # The first verdicts were published in this year
        for year in range(startyear, datetime.date.today().year + 1):
            # We use self.configfile directly rather than
            # self.moduleconfig, since the latter cannot be persisted
            # across sessions (as it is a subset of a composite
            # between the config file and command line options)
            self.configfile[self.module_dir]['startyear'] = year
            self.configfile.write()
            list_url = "http://eur-lex.europa.eu/Result.do?T1=V6&T2=%d&T3=&RechType=RECH_naturel" % year
            self.log.debug("Searching for %d" % year)
            self.browser.open(list_url)
            pagecnt = 0
            while True:
                pagecnt += 1
                self.log.debug("Result page #%s" % pagecnt)
                # For some reason, Mechanize can't find the link to
                # the HTML version of the case text. So we just get
                # the whole page as a string and find unique CELEX ids
                # in the tagsoup.
                pagetext = self.browser.response().read()
                for parts in Util.uniqueList(self.re_celexno.findall(pagetext)):
                    # findall() yields one tuple of groups per match -
                    # concatenate them back into the full number
                    celexno = "".join(parts)
                    # only download actual judgements
                    #   J: Judgment of the Court
                    #   A: Judgment of the Court of First Instance
                    #   W: Judgement of the Civil Service Tribunal
                    #   T: (old) Judgement of the Court
                    if not any(letter in celexno for letter in "JAWT"):
                        continue
                    if self.download_single(celexno, usecache=usecache):
                        self.log.info("Downloaded %s" % celexno)
                    else:
                        self.log.info("Skipped %s" % celexno)
                # see if there are any "next" pages
                try:
                    self.browser.follow_link(text='>')
                except LinkNotFoundError:
                    self.log.info(u'No next page link found, we must be done')
                    break

    @classmethod
    def basefile_from_path(cls, path):
        """Return the basefile encoded in *path*.

        The extension is dropped, the path is split on ``os.sep`` and
        the segments starting three positions after the ``module_dir``
        segment (i.e. past the stage and year directories used by the
        ``*_path`` methods) are joined with "/".
        """
        segments = os.path.splitext(path)[0].split(os.sep)
        first = segments.index(cls.module_dir) + 3
        return "/".join(segments[first:])

    def _stage_path(self, stage, basefile, suffix):
        """Build ``<base_dir>/<module_dir>/<stage>/<year>/<basefile><suffix>``.

        The year is the second group of ``re_celexno``; a basefile that
        does not match the pattern raises ``AttributeError``.
        """
        year = self.re_celexno.match(basefile).group(2)
        return os.path.sep.join([self.base_dir, self.module_dir, stage, year, basefile + suffix])

    def downloaded_path(self, basefile):
        """Return the path of the downloaded HTML file for *basefile*."""
        return self._stage_path(u'downloaded', basefile, '.html')

    def parsed_path(self, basefile):
        """Return the path of the parsed XHTML file for *basefile*."""
        return self._stage_path(u'parsed', basefile, '.xhtml')

    def distilled_path(self, basefile):
        """Return the path of the distilled RDF file for *basefile*."""
        return self._stage_path(u'distilled', basefile, '.rdf')

    def parse_from_soup(self, soup, basefile):
        """Extract metadata and body text from a downloaded document.

        :param soup: parsed HTML of the document (BeautifulSoup-style
            API: ``findAll``, ``findNextSiblings``, ``childGenerator``).
        :param basefile: the CELEX number the file was stored under.
        :returns: dict with keys ``meta`` (RDF graph), ``body``
            (``Body`` of ``Paragraph`` objects), ``lang`` and ``uri``.
        :raises Exception: if the page says no document matched.
        :raises AssertionError: if the CELEX number in the page differs
            from *basefile*.
        :raises KeyError: if the document type letter is not J, A or W.
        """
        # AVAILABLE METADATA IN CASES
        #
        # For now, we create a nonofficial eurlex vocab with namespace http://lagen.nu/eurlex#
        # - celex number (first h1) :celex (:celexnum?)
        #
        # - [Title and reference]
        #   - decision type and date "Judgment of the Court (Third Chamber) of 17 December 2009."
        #     :courtdecision (as opposed to :commissiondecision)
        #   - :party (or parties) "M v Agence européenne des médicaments (EMEA)."
        #   - :referingcourt "Reference for a preliminary ruling: Administrativen sad Sofia-grad - Bulgaria."
        #   - :legalissue - short description and/or(?) keywords (not always present, eg 62009J0403), hyphen sep:
        #     - "Review of the judgment in Case T-12/08 P"
        #     - "Whether the state of the proceedings permits final judgment to be given"
        #     - "Fair hearing"
        #     - "Rule that the parties should be heard"
        #     - "Whether the unity or consistency of Community law is affected."
        #   - :casenum Case number + unknown letters:
        #     - "Case C-197/09 RX-II."
        #     - "Joined cases T-117/03 to T-119/03 and T-171/03."
        #   - :casereporter Case reporter cite "European Court reports 2009 Page 00000"
        # - [Text]
        #   - :availablelang - Available languages ("bg", "es", "cs", "da" ....)
        #   - :authenticlang - Authentic language ("fr" or "French")
        # - [Dates]
        #   - :decisiondate - Date of document (decision/judgement)
        #   - :applicationdate - Date of application
        # - [Classifications] (different from description/keywords above)
        #   - :subjectmatter Subject Matter, comma sep:
        #     - "Staff regulations and employment conditions - EC"
        #     - "Provisions governing the Institutions"
        #   - :directorycode - Case Law Directory Code (where is the full code list?), NL sep:
        #     - "B-09.03 EEC/EC / State aid / Exceptions to the prohibition of aid"
        #     - "B-20.05 EEC/EC / Acts of the institutions / Statement of the reasons on which a measure is based"
        #     - "B-09.03 EEC/EC / State aid / Exceptions to the prohibition of aid"
        #     - "B-09.04 EEC/EC / State aid / Review of aid by the Commission - Rules of procedure"
        # - [Miscellaneous information]
        #   - dct:author Author: "Court of Justice of the European Communities"
        #   - :form Form: "Judgement"
        # - [Procedure]
        #   - :proceduretype - Type of procedure, comma sep:
        #     - "Staff cases"
        #     - "Action for damages"
        #     - "Appeal"
        #     - "REEX=OB"
        #   - :applicant - Applicant: "Official"
        #   - :defendant - Defendant: "EMEA, Institutions"
        #   - :observation - Observations: "Italy, Poland, Member States, European Parliament, Council, Commission, Institutions"
        #   - :judgerapporteur - Judge-Rapporteur: "von Danwitz"
        #   - :advocategeneral - Advocate General: "Mazák"
        # - [Relationships between documents]
        #   - :treaty Treaty: "European Communities"
        #   - :caseaffecting Case affecting, NL-sep:
        #     - "Interprets [CELEXNO + pinpoint]"
        #     - "Declares void 61995A0091"
        #     - "Confirms 31996D0666"
        #   - :"Instruments cited in case law" (celex numbers with pinpoint locations?), nl-sep
        #     - "12001C/PRO/02-A61"
        #     - "12001C/PRO/02-NA13P1"
        #     - "31991Q0530-A114"
        #     - "62007K0023"
        #     - "62008A0012"

        # Convenience nested functions. They close over g, voc, uri and
        # lang, all of which are assigned further down, before the
        # first call.
        def add_literal(predicate, literal):
            g.add((URIRef(uri),
                   voc[predicate],
                   Literal(literal, lang=lang)))

        def add_celex_object(predicate, celexno):
            g.add((URIRef(uri),
                   voc[predicate],
                   URIRef("http://lagen.nu/ext/celex/%s" % celexno)))

        def get_predicate(predicate):
            # True if the document already has a value for this predicate
            return list(g.objects(URIRef(uri), voc[predicate])) != []

        # These are a series of refinements for the "Affecting"
        # relationship. "Cites" doesn't have these (or similar), but
        # "is affected by" has (the inverse properties)
        affects_predicates = {
            "Interprets": "interprets",
            "Interprets the judgment": "interpretsJudgment",
            "Declares void": "declaresVoid",
            "Confirms": "confirms",
            "Declares valid (incidentally)": "declaresValidIncidentally",
            "Declares valid (by a preliminary ruling)": "declaresValidByPreliminaryRuling",
            "Incidentally declares invalid": "declaresInvalidIncidentally",
            "Declares invalid (by a preliminary ruling)": "declaresInvalidByPreliminaryRuling",
            "Amends": "amends",
            "Failure concerning": "failureConcerning",
        }
        isaffected_predicates = {
            "Interpreted by": "interpretedBy",
            "Confirmed by": "confirmedBy",
            "Declared void by": "declaredVoidBy",
            "Annulment requested by": "annulmentRequestedBy",
        }
        # Label of a <strong> element -> base predicate for the links
        # that follow it within the same list item.
        relation_keys = {
            "Treaty:": "treaty",
            "Affected by case:": "isaffected",
            "Case affecting:": "affects",
            "Instruments cited in case law:": "cites",
        }
        # Text nodes matching either pattern are skipped; presumably
        # they are pinpoint-location fragments (see FIXME below) -
        # TODO confirm.
        skip_patterns = (re.compile(r"([ABCDEFGIJKLNPRST]+\d*)+$"),
                         re.compile(r"\d[\d\-]*[ABC]?$"))

        # 1. Express metadata about our document as a RDF graph
        g = Graph()
        voc = Namespace(self.vocab_url)
        g.bind('dct', self.ns['dct'])
        g.bind('eurlex', voc)

        # :celex - first <h1>
        celexnum = soup.h1.string.strip()
        if celexnum == "No documents matching criteria.":
            self.log.warning("%s: No document found!" % basefile)
            raise Exception("No document found!")
        assert celexnum == basefile, "Celex number in file (%s) differ from filename (%s)" % (celexnum, basefile)
        lang = soup.html['lang']

        # 1.1 Create canonical URI for our document. To keep things
        # simple, let's use the celex number as the basis (in the
        # future, we should extend LegalURI to do it)
        uri = "http://lagen.nu/ext/celex/%s" % celexnum
        m = self.re_celexno.match(celexnum)
        # NOTE(review): download_everything also fetches type 'T', which
        # has no entry here and therefore raises KeyError - confirm
        # which RDF type it should map to.
        rdftype = {'J': voc['Judgment'],
                   'A': voc['JudgmentFirstInstance'],
                   'W': voc['JudgmentCivilService']}[m.group(3)]
        g.add((URIRef(uri), RDF.type, rdftype))
        add_literal('celexnum', celexnum)

        # The first section, following <h2>Title and reference</h2>
        # contains :courtdecision, :party (one or two items),
        # :referingcourt (optional), :legalissue (list of strings),
        # :casenum, :casereporter. Since some are optional, we do a
        # little heuristics to find out what we're looking at at any
        # given moment.
        for section in soup.findAll(["h1", "h2"]):
            if section.name == "h1" and section.a and section.a.string == "Text":
                break
            if section.string == u"Title and reference":
                for para in section.findNextSiblings("p"):
                    if not para.string:
                        continue
                    string = para.string.strip()
                    if not get_predicate('courtdecision'):
                        # optional: do sanitychecks to see if this
                        # really is a :courtdecision
                        add_literal('courtdecision', string)
                    elif not get_predicate('party'):
                        # this will be one or two items. Are they position dependent?
                        for party in string.split(" v "):
                            add_literal('party', party)
                    elif (not get_predicate('referingcourt') and
                          string.startswith(("Reference for a preliminary ruling",
                                             "Preliminary ruling requested"))):
                        add_literal('referingcourt', string)
                    elif (not get_predicate('casenum') and
                          string.lower().startswith(("case ", "joined cases "))):
                        add_literal('casenum', string)
                    elif para.em:  # :casereporter is enclosed in an em
                        for row in para.findAll(text=True):
                            add_literal('casereporter', row.strip())
                    elif not get_predicate('legalissue'):
                        # The first paragraph matching none of the
                        # above is taken as the legal issue. (Testing
                        # for an existing value here would make this
                        # branch unreachable, since this is the only
                        # place the predicate is set.)
                        # fixme: Split this up somehow
                        add_literal('legalissue', string)
            elif section.string == "Relationship between documents":
                for item in section.findNextSibling("ul").findAll("li"):
                    predicate = None
                    subpredicate = None
                    for node in item.childGenerator():
                        if not hasattr(node, "name"):
                            # plain text node
                            nodetext = node.strip()
                            if any(pat.match(nodetext) for pat in skip_patterns):
                                continue
                            if predicate == "affects" and nodetext:
                                if nodetext in affects_predicates:
                                    subpredicate = affects_predicates[nodetext]
                                else:
                                    self.log.warning("Can't express '%s' as a affects predicate" % nodetext)
                            elif predicate == "isaffected" and nodetext:
                                if nodetext in isaffected_predicates:
                                    subpredicate = isaffected_predicates[nodetext]
                                else:
                                    self.log.warning("Can't express '%s' as a isaffected predicate" % nodetext)
                        elif node.name == "strong":
                            # A new label starts: any refinement from
                            # the previous one no longer applies. An
                            # unknown label leaves predicate unchanged.
                            subpredicate = None
                            if node.string in relation_keys:
                                predicate = relation_keys[node.string]
                            else:
                                self.log.warning("Don't know how to handle key '%s'" % node.string)
                        elif node.name == "a" and predicate:
                            # FIXME: If the predicate is "cites", the
                            # celex number may have extra crap
                            # (eg. "31968R0259(01)-N2A1L6") indicating
                            # pinpoint location. Transform these to a
                            # fragment identifier.
                            add_celex_object(subpredicate or predicate,
                                             node.string.strip())

        # Process text and create DOM
        self.parser = LegalRef(LegalRef.EGRATTSFALL)
        body = Body()
        textdiv = soup.find("div", "texte")
        if textdiv:
            for node in textdiv.childGenerator():
                if node.string:
                    # Here we should start analyzing for things like
                    # "C-197/09". Note that the Eurlex data does not use
                    # the ordinary hyphen like above, but rather
                    # 'NON-BREAKING HYPHEN' (U+2011) - LegalRef will mangle
                    # this to an ordinary hyphen.
                    subnodes = self.parser.parse(node.string,
                                                 predicate="dct:references")
                    body.append(Paragraph(subnodes))
        else:
            self.log.warning("%s: No fulltext available!" % celexnum)

        # NOTE(review): 'lang' is hard-coded here although the page's
        # own lang attribute was read above - confirm this is intended.
        return {'meta': g,
                'body': body,
                'lang': 'en',
                'uri': uri}

    @classmethod
    def relate_all_setup(cls, config):
        """Optionally build the Whoosh index, then run the parent's setup.

        The index is built only when this module's section of *config*
        has ``whoosh_indexing`` set to the string ``'True'``.
        """
        module_config = config[cls.module_dir]
        if ('whoosh_indexing' in module_config and
                module_config['whoosh_indexing'] == 'True'):
            print("We're doing whoosh_indexing!")
            create_whoosh_index(cls, config)
        else:
            print("No whoosh_indexing :-(")
        super(EurlexCaselaw, cls).relate_all_setup(config)


def create_whoosh_index(cls, config=None):
    """Create a Whoosh full-text index over the parsed documents of *cls*.

    The index lives in ``<datadir>/<module_dir>/index``. Every basefile
    yielded by ``cls.get_iterable_for("relate_all", datadir)`` that
    contains J, A or K is read from its parsed XHTML file, stripped of
    markup and added as one document. Finally a fixed sample query is
    run and its hits are printed.

    :param cls: the repository class (supplies ``module_dir``,
        ``re_celexno`` and ``get_iterable_for``).
    :param config: configuration mapping providing ``config['datadir']``.
        It defaults to ``None`` only to keep the one-argument call
        signature valid; no configuration is otherwise in scope, so
        omitting it is an error.
    :raises ValueError: if *config* is not supplied.
    """
    if config is None:
        raise ValueError("create_whoosh_index() needs the configuration "
                         "mapping (config['datadir'])")
    from time import time
    from BeautifulSoup import BeautifulSoup

    base_dir = config['datadir']
    indexdir = os.path.sep.join([base_dir, cls.module_dir, 'index'])
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)
    print("Creating a new index")
    schema = Schema(title=TEXT(stored=True),
                    basefile=ID(stored=True, unique=True),
                    content=TEXT)
    # FIXME: Get a keyword list, correct title, and list of treaty
    # references (celex nums as keywords or uris or...)
    whoosh_ix = create_in(indexdir, schema)

    for basefile in cls.get_iterable_for("relate_all", base_dir):
        # NOTE(review): the downloader selects types J/A/W/T while this
        # filter uses J/A/K - confirm which set is intended.
        if not ("J" in basefile or "A" in basefile or "K" in basefile):
            continue
        readstart = time()
        # just save the text from the document, strip out the tags
        year = cls.re_celexno.match(basefile).group(2)
        parsed_file = os.path.sep.join([base_dir, cls.module_dir, u'parsed', year, basefile + '.xhtml'])
        with open(parsed_file) as fp:  # close the handle promptly
            soup = BeautifulSoup(fp.read())
        text = ''.join(soup.findAll(text=True))
        # Skip the first 150 chars (XML junk) and normalize space
        text = ' '.join(text[150:].split())
        if not text:
            print("Noadd %s (no text)" % (basefile))
            continue
        indexstart = time()
        writer = whoosh_ix.writer()
        writer.update_document(title="Case " + basefile, basefile=basefile, content=text)
        writer.commit()
        # 1024.0 forces float division so the size keeps its decimal
        print("Added %s '%s...' %.1f kb in %.3f + %.3f s" % (
            basefile, text[:39], len(text) / 1024.0,
            indexstart - readstart, time() - indexstart))

    # Smoke-test the finished index with a sample query
    searcher = whoosh_ix.searcher()
    results = searcher.find("content", "quantitative imports equivalent prohibited", limit=10)
    for i, hit in enumerate(results):
        print("%s: %s" % (hit['title'], results.score(i)))
# Only start the repository's run() entry point when this file is
# executed as a script, not when it is imported.
if __name__ == "__main__":
    EurlexCaselaw.run()