In [8]:
from SPARQLWrapper import SPARQLWrapper, JSON
import os
import pandas as pd
import caselawnet

%matplotlib inline

[nltk_data] Downloading package punkt to /home/dafne/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Client for the SPARQL endpoint of the local Blazegraph instance (namespace `kb`);
# the queries further down are all sent through this object.
sparql = SPARQLWrapper("http://localhost:9999/blazegraph/namespace/kb/sparql")

In [9]:
# Directory that the CSV exports in this notebook are written to.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
outpath = '/media/sf_VBox_Shared/CaseLaw/graphs/lido/'

In [3]:
def sparql_result_to_df(result):
    """Convert a SPARQL JSON result into a pandas DataFrame.

    Parameters
    ----------
    result : dict
        A SPARQL JSON result: ``result['head']['vars']`` lists the selected
        variable names and ``result['results']['bindings']`` holds one dict per
        row, mapping a variable name to a dict with a ``'value'`` entry.

    Returns
    -------
    pandas.DataFrame
        One row per binding and one column per selected variable, in the order
        given by ``head.vars``. Variables that are unbound in a row (e.g. from
        an ``optional`` clause) become ``None``.
    """
    cols = result['head']['vars']
    rows_dicts = result['results']['bindings']
    rows = [{c: r.get(c, {'value': None})['value'] for c in cols} for r in rows_dicts]
    # Passing `columns=` (instead of indexing with `[cols]` afterwards) keeps the
    # column order and also works when the query returned no rows at all;
    # `pd.DataFrame([])[cols]` would raise a KeyError in that case.
    return pd.DataFrame(rows, columns=cols)

In [4]:
# Nodes: all jurisprudentie (case law) records that have a creator
queryString = """
    prefix dcterm: <http://purl.org/dc/terms/> 
    select ?id ?creator ?title
    {
        ?id dcterm:type "Jurisprudentie".
         ?id dcterm:creator ?creator. 
        optional { ?id dcterm:title ?title }
    }
"""

sparql.setQuery(queryString)
sparql.setReturnFormat(JSON)
ret = sparql.query()
result = ret.convert()
cases = sparql_result_to_df(result)

# The patterns are raw strings: '\.' inside a normal string literal is an
# invalid escape sequence that newer Python versions warn about. The regular
# expressions themselves are unchanged.
# The ECLI is taken from the end of the id; the court code is the third
# colon-separated component of the ECLI.
ecli_regex = r'(ECLI:[A-Z]{2}:[A-Z]*:[0-9]{4}:[0-9A-Z\.]{1,25}$)'
court_regex = r'ECLI:[A-Z]{2}:([A-Z]*):[0-9]{4}:[0-9A-Z\.]{1,25}$'
cases['ecli'] = cases.id.str.extract(ecli_regex, expand=False)
cases['court'] = cases.ecli.str.extract(court_regex, expand=False)
cases.shape

(68339, 5)

In [6]:
# Count the cases per court code (the ECLI component extracted into `court`).
cases['court'].value_counts()

HR    68339
Name: court, dtype: int64

In [None]:
## TODO: there are duplicate ids

In [5]:
# Links between cases: every link resource whose source and target are both
# "Jurisprudentie" records that have a creator, together with the link type
# and its label.
# The rdfs prefix is declared explicitly: the query uses rdfs:label, and
# standard SPARQL requires every prefix to be declared (relying on a store's
# built-in default prefixes makes the query non-portable).
queryString = """
    prefix dcterm: <http://purl.org/dc/terms/> 
    prefix overheidrl: <http://linkeddata.overheid.nl/terms/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    select ?link_id ?source ?target ?link_type ?link_type_label
    {
        ?source dcterm:type "Jurisprudentie".
        ?target dcterm:type "Jurisprudentie".
        ?link_id overheidrl:heeftLinktype ?link_type.
        ?link_id overheidrl:linktVan  ?source.
        ?link_id overheidrl:linktNaar  ?target.
        ?link_type rdfs:label ?link_type_label.
        ?source dcterm:creator ?creator_source. 
        ?target dcterm:creator ?creator_target. 
    }
"""

sparql.setQuery(queryString)
sparql.setReturnFormat(JSON)
ret = sparql.query()
result = ret.convert()
links = sparql_result_to_df(result)
links.shape

(17240, 5)

In [7]:
# Drop self-loops, i.e. links whose source and target are the same case.
is_proper_link = links['source'] != links['target']
links = links[is_proper_link]
links.shape

(16129, 5)

In [10]:
# Export nodes and links to a JSON graph file via caselawnet.utils.to_sigma_json.
# The file is written into `outpath`, next to the CSV exports; the previous
# single-argument os.path.join('hr_simple.json') was a no-op that dropped the
# file into the current working directory instead.
# NOTE(review): 'Hoge Raad' is presumably the graph title — confirm against caselawnet.
caselawnet.utils.to_sigma_json(cases.to_dict(orient='records'),
                               links.to_dict(orient='records'),
                               'Hoge Raad',
                               filename=os.path.join(outpath, 'hr_simple.json')
                              )

In [11]:
# Write the node table and the link table to `outpath` as CSV files,
# without the DataFrame index column.
for frame, csv_name in ((cases, 'hr_simple_nodes.csv'), (links, 'hr_simple_links.csv')):
    frame.to_csv(os.path.join(outpath, csv_name), index=False)

## Enrich the nodes

In [13]:
# All ECLI identifiers of the queried cases, in table order.
eclis = list(cases['ecli'])
# Accumulator for the enriched node records; the batch loop further down appends to it.
# NOTE: re-running this cell resets the list and discards earlier enrichment results.
nodes_enriched = []

In [33]:
# Distinct ECLIs for which an enriched record has already been collected.
eclis_enriched = {node['ecli'] for node in nodes_enriched}

In [34]:
# Number of distinct ECLIs that already have an enriched record.
len(eclis_enriched)

68333

In [None]:
# Enrich the ECLIs in slices of `batch_size`, skipping everything that already
# has an enriched record so the loop can be resumed after an interruption.
batch_size = 1000
for i in range(0, len(eclis), batch_size):
    print(i)
    batch = set(eclis[i:i+batch_size]).difference(eclis_enriched)
    if not batch:
        # Every ECLI in this slice is already enriched; nothing to request.
        continue
    # Materialise the result once so it can be both stored and inspected.
    new_nodes = list(caselawnet.enrich_eclis(batch, 
                                             rootpath='/media/sf_VBox_Shared/CaseLaw/OpenDataUitspraken/'))
    nodes_enriched += new_nodes
    # Keep the "already enriched" set current: the ids contain duplicates, so
    # without this an ECLI that shows up again in a later slice would be
    # enriched (and appended) a second time.
    eclis_enriched.update(node['ecli'] for node in new_nodes)

In [29]:
# Compare the number of requested ECLIs with the number of enriched records collected.
len(eclis), len(nodes_enriched)

(68339, 68333)

In [32]:
# Turn the list of enriched node records into a DataFrame and preview the first rows.
nodes_enriched_df = pd.DataFrame(nodes_enriched)
nodes_enriched_df.head()

Unnamed: 0,abstract,articles,count_annotation,count_version,creator,date,ecli,id,subject,title,year
0,"Artikel 3.92, lid 1, Wet IB 2001; inkomstenbel...",[],4,6,http://standaarden.overheid.nl/owms/terms/Hoge...,2010-01-15,ECLI:NL:HR:2010:BH9198,http://deeplink.rechtspraak.nl/uitspraak?id=EC...,http://psi.rechtspraak.nl/rechtsgebied#bestuur...,"ECLI:NL:HR:2010:BH9198 Hoge Raad , 15-01-2010 ...",2010
1,,[],1,1,http://standaarden.overheid.nl/owms/terms/Hoge...,1955-09-14,ECLI:NL:HR:1955:AY2555,http://deeplink.rechtspraak.nl/uitspraak?id=EC...,,,1955
2,,[],0,1,http://standaarden.overheid.nl/owms/terms/Hoge...,1956-02-15,ECLI:NL:HR:1956:AY2002,http://deeplink.rechtspraak.nl/uitspraak?id=EC...,,,1956
3,,[],1,1,http://standaarden.overheid.nl/owms/terms/Hoge...,1955-03-30,ECLI:NL:HR:1955:AY2453,http://deeplink.rechtspraak.nl/uitspraak?id=EC...,,,1955
4,,[],0,1,http://standaarden.overheid.nl/owms/terms/Hoge...,1954-05-26,ECLI:NL:HR:1954:AY2897,http://deeplink.rechtspraak.nl/uitspraak?id=EC...,,,1954


In [36]:
# Save the enrichment result to `outpath` as CSV, without the index column.
nodes_enriched_df.to_csv(os.path.join(outpath, 'hr_enriched_nodes_1.csv'), index=False)

In [37]:
# Join the SPARQL case table with selected enrichment columns on the ECLI.
# Both sides carry a `creator` column, so the result has `creator_x` (from
# `cases`) and `creator_y` (from the enrichment).
enriched_columns = ['ecli', 'subject', 'creator', 'year', 'date', 'abstract']
cases_merged = pd.merge(cases, nodes_enriched_df[enriched_columns], on='ecli')

In [38]:
# Prefer the creator from the SPARQL result (`creator_x`); when it is empty,
# fall back to the last path segment of the enriched creator (`creator_y`)
# with underscores replaced by spaces.
creator_pairs = zip(cases_merged['creator_x'], cases_merged['creator_y'])
cases_merged['creator'] = [
    sparql_creator if sparql_creator
    else enriched_creator.split('/')[-1].replace('_', ' ')
    for sparql_creator, enriched_creator in creator_pairs
]

In [40]:
# Keep only the final node columns, in export order; this drops the
# intermediate `creator_x` / `creator_y` columns.
node_columns = ['id', 'ecli', 'title', 'subject', 'creator', 'year', 'date', 'abstract']
cases_merged = cases_merged[node_columns]

In [41]:
# Preview the first rows of the merged node table.
cases_merged.head()

Unnamed: 0,id,ecli,title,subject,creator,year,date,abstract
0,http://linkeddata.overheid.nl/terms/jurisprude...,ECLI:NL:HR:1954:AY3407,"ECLI:NL:HR:1954:AY3407 - Hoge Raad, 10-11-1954...",,Hoge Raad der Nederlanden,1954,1954-11-10,
1,http://linkeddata.overheid.nl/terms/jurisprude...,ECLI:NL:HR:1954:AY3410,"ECLI:NL:HR:1954:AY3410 - Hoge Raad, 03-11-1954...",,Hoge Raad der Nederlanden,1954,1954-11-03,
2,http://linkeddata.overheid.nl/terms/jurisprude...,ECLI:NL:HR:1954:AY3414,"ECLI:NL:HR:1954:AY3414 - Hoge Raad, 03-11-1954...",,Hoge Raad der Nederlanden,1954,1954-11-03,
3,http://linkeddata.overheid.nl/terms/jurisprude...,ECLI:NL:HR:1954:AY4071,"ECLI:NL:HR:1954:AY4071 - Hoge Raad, 23-06-1954...",,Hoge Raad der Nederlanden,1954,1954-06-23,
4,http://linkeddata.overheid.nl/terms/jurisprude...,ECLI:NL:HR:1998:AA2296,"ECLI:NL:HR:1998:AA2296 - Hoge Raad, 10-08-1998...",http://psi.rechtspraak.nl/rechtsgebied#bestuur...,Hoge Raad der Nederlanden,1998,1998-08-10,-


In [42]:
# Save the merged node table to `outpath` as CSV, without the index column.
cases_merged.to_csv(os.path.join(outpath, 'hr_enriched_nodes_2.csv'), index=False)

In [51]:
# Rename the link identifier column from 'link_id' to 'id'.
# NOTE(review): presumably caselawnet expects link records keyed by 'id' — confirm.
links = links.rename(columns={'link_id': 'id'})

In [None]:
# Hand nodes and links to caselawnet as lists of per-row dicts and collect
# the node and link records it returns.
node_records = cases_merged.to_dict(orient='records')
link_records = links.to_dict(orient='records')
cases_network, links_network = caselawnet.network_analysis.add_network_statistics(
    node_records, link_records)