Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 354 lines (307 sloc) 11.6 KB
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
# Script to build graphwiz graphs from RDF data
import os
import sys
import locale
import codecs
import subprocess
from collections import defaultdict
from time import time
from StringIO import StringIO
import xml.etree.cElementTree as ET
try:
from rdflib.Graph import Graph
except ImportError:
from rdflib import Graph
from rdflib import URIRef, Literal
import Util
from SesameStore import SesameStore
# Activate the user's default locale for all locale categories.
locale.setlocale(locale.LC_ALL, '')

# Choose the console encoding: whatever stdout reports, otherwise cp850 on
# Windows and the locale's preferred encoding everywhere else.
defaultencoding = sys.stdout.encoding
if not defaultencoding:
    if sys.platform == 'win32':
        defaultencoding = 'cp850'
    else:
        defaultencoding = locale.getpreferredencoding()

# Wrap the original stdout/stderr in encoding writers; the 'replace' error
# mode substitutes unencodable characters instead of raising.
sys.stdout = codecs.getwriter(defaultencoding)(sys.__stdout__, 'replace')
sys.stderr = codecs.getwriter(defaultencoding)(sys.__stderr__, 'replace')
def build_rdf_graph(d):
    """Load every ``.xht2`` file found in directory *d* into one graph.

    Each matching file is loaded with ``format="rdfa"`` into a single
    ``Graph``.  A dot is written to stdout for each file that loads; a
    file that raises is reported and skipped.  A summary line with the
    triple count, the number of candidate files and the elapsed time is
    printed at the end.

    Returns the populated graph.
    """
    started = time()
    graph = Graph()
    # Only files with the .xht2 suffix are considered.
    candidates = [name for name in os.listdir(d) if name.endswith(".xht2")]
    for name in candidates:
        try:
            graph.load(d + os.path.sep + name, format="rdfa")
            sys.stdout.write(".")
        except Exception:
            # Best effort: one bad file must not abort the whole load.
            print("loading %r failed" % name)
    sys.stdout.write("\n")
    print("Graph with %d triples loaded from %d files in %.3f sec"
          % (len(graph), len(candidates), time() - started))
    return graph
def build_dotfile_from_rdf_graph(g):
    """Render the RDF graph *g* as a Graphviz DOT document.

    Every resource that has a dct:identifier becomes a node labelled with
    that identifier and the first 20 characters of its dct:description.
    Every distinct dct:subject value becomes an ellipse node, and each
    resource is linked to its dct:subject values.

    The graph label is taken from ``sys.argv[1]`` (decoded as ISO-8859-1,
    with backslashes doubled so they appear literally in the DOT string).

    Returns the ``StringIO`` buffer holding the DOT text.
    """
    start = time()
    out = StringIO()
    # The node attribute statement has to be terminated with "];" before
    # the first node statement, otherwise the output is not valid DOT.
    out.write(u"""digraph G {
graph [fontname = "Arial",
fontsize = 16,
overlap = compress,
model = subset,
label = "%s",
];
node [ shape = box,
style = rounded,
fontname = "Arial",
fontsize= 10
];
""" % sys.argv[1].decode('iso-8859-1').replace("\\", "\\\\"))
    identifier = URIRef(u'http://purl.org/dc/terms/identifier')
    description = URIRef(u'http://purl.org/dc/terms/description')
    dctsubject = URIRef(u'http://purl.org/dc/terms/subject')

    # Index the triples as objs[first element][second element] = third
    # element, whitespace-normalizing literal values on the way in.
    objs = defaultdict(dict)
    for (resource, predicate, value) in g:
        if isinstance(value, Literal):
            objs[resource][predicate] = Util.normalizeSpace(value)
        else:
            objs[resource][predicate] = value

    # Build nodes for all primary objects (those with an identifier).
    for resource in objs:
        properties = objs[resource]
        if identifier not in properties:
            continue
        idstr = Util.normalizeSpace(properties[identifier])
        try:
            descstr = Util.normalizeSpace(properties[description][:20])
        except KeyError:
            print("Description for %s is missing" % resource)
            descstr = ""
        out.write(u"\"%s\"" % idstr)
        out.write(u" [label=<%s<BR/>%s...>];\n" % (idstr, descstr))

    # Query the dct:subject pairs once; they are needed both for the
    # keyword nodes and for the edges.
    subject_links = [(subj, Util.normalizeSpace(obj))
                     for (subj, obj) in g.subject_objects(dctsubject)]

    # Build nodes for all dct:subjects ("Sokord"), one per distinct value.
    for keyword in set(keyword for (_, keyword) in subject_links):
        out.write(u"\"%s\"" % keyword)
        out.write(u" [shape=ellipse]\n")

    # Relations between primary objects (via the rinfo
    # "rattsfallshanvisning" predicate) are intentionally not emitted;
    # only object -> dct:subject edges are drawn.
    for (subj, keyword) in subject_links:
        if identifier not in objs[subj]:
            # No node was emitted for this resource above, so there is
            # nothing to attach the edge to.
            print("No identifier for %s; skipping its link to %s" % (subj, keyword))
            continue
        # Use the same normalized identifier as the node statement so the
        # edge endpoint and the node name always match.
        out.write(u"\"%s\" -> \"%s\"\n"
                  % (Util.normalizeSpace(objs[subj][identifier]), keyword))
    out.write(u"}")
    print("dot graph created in %.3f sec" % (time() - start))
    return out
def create_graph(f, engine="dot", arguments="", filename="tmp.png", filetype="png"):
    """Write the DOT text held by *f* to ``tmp.dot`` and render it.

    Parameters:
        f         -- object with a ``getvalue()`` method returning the DOT text
        engine    -- layout program to run
        arguments -- extra command line options as one string; it is split
                     with shell-like quoting rules
        filename  -- output file, passed to the engine as ``-o<filename>``
        filetype  -- output format, passed to the engine as ``-T<filetype>``

    ``tmp.dot`` is created in the current working directory.  The engine
    is started without a shell, so file names containing spaces or shell
    metacharacters are passed through unchanged.  A failure to start the
    engine or a non-zero exit status is reported on stdout instead of
    being reported as success.  Returns None.
    """
    import shlex  # only needed here; kept local to leave file imports alone

    start = time()
    with codecs.open("tmp.dot", "w", encoding="utf-8", errors="replace") as dotfile:
        dotfile.write(f.getvalue())

    cmd = [engine] + shlex.split(arguments)
    cmd += ["-T%s" % filetype, "-o%s" % filename, "tmp.dot"]
    print("Running %s" % " ".join(cmd))
    try:
        ret = subprocess.call(cmd)
    except OSError as e:
        # Raised when the engine executable cannot be started at all.
        print("Could not run %s: %s" % (engine, e))
        return
    if ret != 0:
        print("%s exited with status %d, %s was not created properly"
              % (engine, ret, filename))
        return
    print("Graph %s created in %.3f sec" % (filename, time() - start))
def build_csvfile_from_sparql_results(res):
    """Write the result rows *res* to ``out.csv`` in the working directory.

    *res* is a sequence of dicts with the keys 'subj', 'pred' and 'obj';
    each dict becomes one CSV row with the columns in that order.  No
    header row is written.  The file is closed (and thereby flushed)
    before the function returns.
    """
    print("Writing %s rows to CSV file, YAY!" % len(res))
    import csv
    with open("out.csv", "w") as csvfile:
        writer = csv.DictWriter(csvfile, ['subj', 'pred', 'obj'])
        writer.writerows(res)
def sparql_select(sq):
    """Run the SPARQL SELECT query *sq* and return its rows as dicts.

    The query is sent to the "mysite" repository of the Sesame server at
    http://localhost:8080/openrdf-sesame and the reply is parsed as
    SPARQL XML results.  Each <result> element becomes a dict mapping the
    binding name to a shortened, UTF-8 encoded value: only the part after
    the last "/" is kept, and of that only the part before the first "-".

    Returns a list with one dict per result row.
    """
    # The "lagen.nu" repository on the same server was used previously.
    store = SesameStore("http://localhost:8080/openrdf-sesame", "mysite")
    tree = ET.fromstring(store.select(sq))
    resultnodes = tree.findall(".//{http://www.w3.org/2005/sparql-results#}result")
    print("%s rows in query result" % len(resultnodes))

    rows = []
    for resultnode in resultnodes:
        bindings = {}
        for binding in resultnode:
            name = binding.attrib['name']
            # The first child of the binding carries the value text.
            text = binding[0].text
            # Last path segment, cut at the first "-" (if there is one).
            shortened = text.split("/")[-1].split("-")[0]
            bindings[name] = shortened.encode("utf-8")
        rows.append(bindings)
    return rows
def build_dotfile_from_sparql_results(nodes, links, graphname):
    """Build a Graphviz DOT citation graph from two SPARQL result sets.

    Parameters:
        nodes     -- rows (dicts with at least 'subj') naming the documents
                     that always get a node statement
        links     -- rows (dicts with at least 'subj' and 'obj'); every
                     distinct row becomes one edge subj -> obj
        graphname -- text used as the graph label

    Each node is scaled by the number of distinct links pointing at it:
    width is count/1.3, height is count/2 and font size is count*4+8.
    Documents listed in the ``specials`` table get a highlighted box with
    a human readable case name in the label.

    Returns the ``StringIO`` buffer holding the DOT text.
    """
    # Well-known cases: document id -> display name.  Backslash-n is passed
    # through to DOT, which renders it as a line break in the label.
    specials = {"61991J0267": "Keck\\n(C-267/91)",
                "61978J0120": "Cassis\\n(120/78)",
                "61974J0008": "Dassonville\\n(8/74)",
                "61984J0178": "German Beer\\n(178/84)",
                "61982J0174": "Sandoz\\n(174/82)",
                "61975J0104": "de Peijper\\n(104/75)",
                "61988J0202": "Terminal equipment\\n(C-202/88)",
                "61993J0427": "Bristol-Myers Squibb\\n(C-427/93)",
                "62001J0101": "Bodil",
                # \xd6 is "O with diaeresis"; written as an escape so the
                # value does not depend on the encoding the file is saved in.
                "62000J0465": u"\xd6sterreich. Rundfunk"}
    start = time()
    out = StringIO()
    out.write(u"""digraph G {
graph [fontname = "Arial",
fontsize = 16,
overlap = compress,
model = subset,
label = "%s",
size=20,
];
node [ shape = box,
style = rounded,
fontname = "Arial",
fontsize= 10,
URL="http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=CELEX:\\N:EN:NOT"
];
""" % graphname)

    # sizes[doc] = number of distinct links pointing at doc.  Every entry
    # in `nodes` is registered with 0 so it is drawn even when uncited.
    sizes = defaultdict(int)
    seen_nodes = set()
    for row in nodes:
        seen_nodes.add(row['subj'])
        sizes[row['subj']] = 0

    # One pass over the links: drop exact duplicate rows, count incoming
    # links and emit the edge.
    seen_links = set()
    for row in links:
        link_key = tuple(sorted(row.items()))
        if link_key in seen_links:
            continue
        seen_links.add(link_key)
        seen_nodes.add(row['subj'])
        sizes[row['obj']] += 1
        try:
            out.write(u" \"%s\" -> \"%s\";\n" % (row['subj'], row['obj']))
        except UnicodeDecodeError:
            # Byte strings that cannot be decoded cannot be written to the
            # unicode buffer; skip the edge but say so.
            print("Skipping undecodable link %r" % (row,))

    for (node, size) in sizes.items():
        width = size / 1.3
        # Float division: the value is formatted with %.1f, and integer
        # division would silently truncate odd counts.
        height = size / 2.0
        fontsize = (size * 4) + 8
        if node in specials:
            out.write(u" \"%s\" [width=%.1f, height=%.1f, fixedsize=True, fontsize=%.1f, color=lightblue2, style=\"filled,rounded\", shape=box, label=\"%s\\n%s\\n[%d]\"];\n" % (node, width, height, fontsize, specials[node], node, size))
        else:
            out.write(u" \"%s\" [width=%.1f, height=%.1f, fontsize=%.1f, label=\"%s\\n[%s]\"];\n" % (node, width, height, fontsize, node, size))
    out.write(u"}\n")
    print("dot graph (%s nodes, %s vertices) created in %.3f sec"
          % (len(seen_nodes), len(seen_links), time() - start))
    return out
# Mapping between the most recent article (eg art 34 TFEU) and its
# previous incarnations (article 28 EC, article 30 EEC etc).  Shared by
# query_cite and query_link so the table exists in one place only.
_EQUIVALENT_ARTICLES = {
    '12008E034': ['11997E028', '11992E030', '11957E030'],
    '12008E036': ['11997E030', '11992E036', '11957E036'],
    '12008E267': ['11997E234', '11992E177', '11957E177'],
}


def _cites_pattern(celexid):
    """Return the graph pattern matching documents that cite *celexid*.

    When *celexid* has earlier incarnations in the equivalence table, the
    pattern is a UNION over the id itself and each of those ids.
    """
    targets = [celexid] + _EQUIVALENT_ARTICLES.get(celexid, [])
    return " UNION ".join(
        "{ ?subj eurlex:cites <http://lagen.nu/ext/celex/%s> }\n" % target
        for target in targets)


def query_cite(celexid):
    """Return the SPARQL query text for the "cite" node set of *celexid*.

    The WHERE clause consists of the cites pattern for *celexid* (and its
    equivalent articles) followed by a FILTER on ?obj and ?pred.
    """
    # NOTE(review): ?pred and ?obj are used in the FILTER but no pattern in
    # this query binds them (query_link has "?subj ?pred ?obj ." for that);
    # confirm that this is intended.
    return """
PREFIX eurlex:<http://lagen.nu/eurlex#>
SELECT DISTINCT ?subj ?pred ?obj WHERE {
%s
FILTER (regex(str(?obj), "^http://lagen.nu/ext/celex/6") && ?pred = eurlex:cites)
}
""" % _cites_pattern(celexid)


def query_link(celexid):
    """Return the SPARQL query text for the "cite" link set of *celexid*.

    Selects the eurlex:cites triples whose subject cites *celexid* (or one
    of its equivalent articles) and whose object URI starts with
    http://lagen.nu/ext/celex/6.
    """
    return """
PREFIX eurlex:<http://lagen.nu/eurlex#>
SELECT DISTINCT ?subj ?pred ?obj WHERE {
?subj ?pred ?obj .
%s
FILTER (regex(str(?obj), "^http://lagen.nu/ext/celex/6") && ?pred = eurlex:cites)
}
""" % _cites_pattern(celexid)
# matches the prefix of the URL, ie 31996L0009 will match
# <http://lagen.nu/ext/celex/31996L0009> but also
# <http://lagen.nu/ext/celex/31996L0009-A03P1>
def query_citeroot(celexid):
    """Return the SPARQL query text for the "citeroot" node set.

    Selects all eurlex:cites / eurlex:interprets triples whose object URI
    starts with http://lagen.nu/ext/celex/ followed by *celexid*; since
    this is a prefix match, sub-parts of the document match as well.
    """
    object_filter = ('FILTER (regex(str(?obj), "^http://lagen.nu/ext/celex/%s") &&'
                     % celexid)
    query_lines = [
        "",
        "PREFIX eurlex:<http://lagen.nu/eurlex#>",
        "SELECT DISTINCT ?subj ?pred ?obj WHERE {",
        "?subj ?pred ?obj .",
        object_filter,
        "(?pred = eurlex:cites || ?pred = eurlex:interprets))",
        "}",
        "",
    ]
    return "\n".join(query_lines)
def query_linkroot(celexid):
    """Return the SPARQL query text for the "citeroot" link set.

    Selects the eurlex:cites / eurlex:interprets triples ?subj -> ?obj
    where both ?subj and ?obj have some property whose value URI starts
    with http://lagen.nu/ext/celex/ followed by *celexid*.
    """
    template = """
PREFIX eurlex:<http://lagen.nu/eurlex#>
SELECT DISTINCT ?subj ?pred ?obj WHERE {
?subj ?pred ?obj .
?obj ?pred2 ?obj2 .
?subj ?pred3 ?obj3 .
FILTER (regex(str(?obj2), "^http://lagen.nu/ext/celex/%(celexid)s") &&
regex(str(?obj3), "^http://lagen.nu/ext/celex/%(celexid)s") &&
(?pred = eurlex:cites || ?pred = eurlex:interprets))
}
"""
    # The same id is substituted into both regex prefixes.
    return template % {"celexid": celexid}
if __name__ == "__main__":
    # Command line: <script> cite|citeroot CELEXID [OUTFILE]
    # Runs the node and link queries for CELEXID and renders the resulting
    # citation graph with dot.  OUTFILE defaults to <type>_<id>.pdf; its
    # extension selects the output format.
    if len(sys.argv) < 3:
        sys.exit("Usage: %s cite|citeroot CELEXID [OUTFILE]" % sys.argv[0])
    querytype = sys.argv[1]
    queryarg = sys.argv[2]
    if len(sys.argv) <= 3:
        outfile = querytype + "_" + queryarg + ".pdf"
    else:
        outfile = sys.argv[3]
    # Take the text after the LAST dot of the file name.  Splitting on
    # every "." picked the wrong piece for names such as "./out.pdf" or
    # "a.b.png" and raised IndexError for names without a dot.
    filetype = os.path.splitext(outfile)[1][1:]
    if not filetype:
        sys.exit("Cannot tell the output format: %r has no file extension" % outfile)

    if querytype == "cite":
        nodes_sq = query_cite(queryarg)
        links_sq = query_link(queryarg)
    elif querytype == "citeroot":
        nodes_sq = query_citeroot(queryarg)
        links_sq = query_linkroot(queryarg)
    else:
        raise ValueError("Unknown query type %s" % querytype)

    print(nodes_sq)
    nodes = sparql_select(nodes_sq)
    print(links_sq)
    links = sparql_select(links_sq)
    f = build_dotfile_from_sparql_results(nodes, links, querytype + " " + queryarg)
    create_graph(f, engine="dot", filename=outfile, filetype=filetype)