In [1]:
# This notebook introduces TUM Legal Tech's scrape of the HUDOC website, stored in a MongoDB instance on the chair's server.
# This portion is restricted to the English-language judgements.
# Website URL: https://hudoc.echr.coe.int/#{%22documentcollectionid2%22:[%22GRANDCHAMBER%22,%22CHAMBER%22]}

# written by Rashid Haddad, HiWi at TUM Legal Tech Chair

In [4]:
import os
import re
from urllib.parse import quote_plus

import pandas as pd
from pymongo import MongoClient

# db connection setup
# Host and credentials can be overridden through environment variables, so the
# notebook no longer has to be edited (or lines commented in and out) to switch
# between environments. The defaults reproduce the original hard-coded values.
#   ECHR_MONGO_HOST      default "f27se1.in.tum.de"; the original notebook kept a
#                        commented-out "localhost" variant labelled "server"
#                        (presumably for running on the server itself; confirm)
#   ECHR_MONGO_USER      default "echr_read"
#   ECHR_MONGO_PASSWORD  default "echr_read"
MONGO_HOST = os.environ.get("ECHR_MONGO_HOST", "f27se1.in.tum.de")
MONGO_USER = os.environ.get("ECHR_MONGO_USER", "echr_read")
MONGO_PASSWORD = os.environ.get("ECHR_MONGO_PASSWORD", "echr_read")

# Percent-escape the credentials so reserved characters (':', '@', '/', ...)
# cannot corrupt the connection string.
URI = "mongodb://%s:%s@%s:27017/echr" % (
    quote_plus(MONGO_USER),
    quote_plus(MONGO_PASSWORD),
    MONGO_HOST,
)
client = MongoClient(URI)
database = client['echr']

# db setup: collection used by every query in this notebook
hejud = database["hejud"]

In [5]:
# Overview of all of the fields currently available. Some post-processing fields are optional, so not all documents have them.
# The bare list literal is the last expression of the cell, so the notebook echoes it as the cell output.
['_id', 'originatingbody', 'ECHRRanking', 'appnoparts', 'representedby', 'sharepointid', 'typedescription', 'resolutionnumber', 'nonviolation', 'scl', 'organisations', 'documentcollectionid', 'judges', 'courts', 'conclusion', 'documentcollectionid2', 'meetingnumber', 'externalsources', 'doctypebranch', 'appno', 'respondent', 'application', 'importance', 'extractedappno', 'kpdateAsText', 'rulesofcourt', 'ecli', 'isplaceholder', 'Rank', 'violation', 'publishedby', 'judgementdate', 'dmdocnumber', 'sclappnos', 'separateopinion', 'doctype', 'languageisocode', 'introductiondate', 'reportdate', 'kpthesaurus', 'issue', 'applicability', 'languagenumber', 'docname', 'article', 'counter', 'kpdate', 'doctext_html', 'doctext_pdf', 'scl_array', 'doc_text', 'pdf', 'html', 'START', 'PROCEDURE', 'INTRODUCTION', 'PROCEDURE_AND_FACTS', 'FACTS', 'RELEVANT_LEGAL_FRAMEWORK', 'RELEVANT_DOMESTIC_LAW', 'LAW', 'PROCEEDINGS_BEFORE_THE_COMMISSION', 'FINAL_SUBMISSIONS_MADE_TO_THE_COURT_BY_THE_GOVERNMENT', 'FINAL_SUBMISSIONS_TO_THE_COURT', 'COURT_CONCLUSION', 'SEPARATE_OPINION', 'SUPPLEMENTARY_OBSERVATIONS', 'FULL_TEXT', 'sentences', 'FACTS_segmented', 'FACTS_segmented_no_headers', 'FACTS_segmented_new', 'FACTS_segmented_no_headers_new', 'PCR_FACTS', 'PCR_REMAINDER', 'PCR_CONCLUSION', 'PCR_REMAINDER_REMAINDER', 'SCL_EXTRACTIONS', 'articles_from_conclusion', 'articles_from_header_sentences', 'articles_merged', 'V_DOT_EXTRACTIONS', 'SCL_APPNO_DIRECT', 'APPNO_DIRECT', 'APPNOS_MERGE', 'APPNOS_MERGE2', 'split_votes']

['_id',
 'originatingbody',
 'ECHRRanking',
 'appnoparts',
 'representedby',
 'sharepointid',
 'typedescription',
 'resolutionnumber',
 'nonviolation',
 'scl',
 'organisations',
 'documentcollectionid',
 'judges',
 'courts',
 'conclusion',
 'documentcollectionid2',
 'meetingnumber',
 'externalsources',
 'doctypebranch',
 'appno',
 'respondent',
 'application',
 'importance',
 'extractedappno',
 'kpdateAsText',
 'rulesofcourt',
 'ecli',
 'isplaceholder',
 'Rank',
 'violation',
 'publishedby',
 'judgementdate',
 'dmdocnumber',
 'sclappnos',
 'separateopinion',
 'doctype',
 'languageisocode',
 'introductiondate',
 'reportdate',
 'kpthesaurus',
 'issue',
 'applicability',
 'languagenumber',
 'docname',
 'article',
 'counter',
 'kpdate',
 'doctext_html',
 'doctext_pdf',
 'scl_array',
 'doc_text',
 'pdf',
 'html',
 'START',
 'PROCEDURE',
 'INTRODUCTION',
 'PROCEDURE_AND_FACTS',
 'FACTS',
 'RELEVANT_LEGAL_FRAMEWORK',
 'RELEVANT_DOMESTIC_LAW',
 'LAW',
 'PROCEEDINGS_BEFORE_THE_COMMISSION',
 'FI

In [7]:
# Fetch an example document. NOTE (!!!): not every document carries every field, so this cell
# may have to be re-run until a useful document comes up.
# doc = hejud.find_one() # one way, doesn't randomly sample

# Draw two documents at random; `docs` is the resulting cursor and `doc` is the first
# document taken from it (the cursor keeps whatever has not been consumed yet).
docs = hejud.aggregate(
    [
        {'$sample': {'size': 2}},
    ]
)
doc = next(docs)

ServerSelectionTimeoutError: f27se1.in.tum.de:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 66c85f0a9d95df36e414adf0, topology_type: Unknown, servers: [<ServerDescription ('f27se1.in.tum.de', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('f27se1.in.tum.de:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

In [17]:
# Note: The earlier fields are the originals from the database. Fields from 'START' onwards have been added in postprocessing.
# Rashid's work begins at field 'sentences', and those fields are the most accurate.
# `docs` is the cursor produced by the $sample aggregation; this loop prints every document it still
# yields and rebinds `doc` on each iteration, so `doc` ends up as the last document printed.
for doc in docs:
    print(doc)

{'_id': '001-192223', 'originatingbody': '29', 'ECHRRanking': 1081, 'appnoparts': ['14486', '07'], 'representedby': 'OVDIYENKO H.V. ; OKHOTNIKOVA N.G.', 'sharepointid': '487304', 'typedescription': '15', 'resolutionnumber': '', 'nonviolation': '34', 'scl': '', 'organisations': 'European Committee for the Prevention of Torture;ECHR', 'documentcollectionid': ['CASELAW', 'JUDGMENTS', 'COMMITTEE', 'ENG'], 'judges': ['André Potocki', 'Mārtiņš Mits'], 'courts': 'Supreme Court', 'conclusion': 'Violation of Article 3 - Prohibition of torture (Article 3 - Degrading treatment) (Substantive aspect);Violation of Article 3 - Prohibition of torture (Article 3 - Degrading treatment) (Substantive aspect);Violation of Article 6+6-1 - Right to a fair trial (Article 6-3-c - Defence through legal assistance) (Article 6 - Right to a fair trial;Criminal proceedings;Article 6-1 - Fair hearing);Violation of Article 34 - Individual applications (Article 34 - Hinder the exercise of the right of application);No 

In [14]:
# Of the original metadata, the fields that matter most when working with ECHR data are
# the document id, the application number(s) and the case name:
for field_name in ('_id', 'appno', 'docname'):
    print(doc[field_name])

001-57785
['13770/88']
CASE OF MEGYERI v. GERMANY


In [20]:
# The db is structured into docs which mirror the pages on the HUDOC website. A case has a unique application number,
# but can be linked to multiple such documents if there were revisions.

In [21]:
# I have parsed the html carefully and the resulting fields are the fairest compromise between granularity and
# error rate. Unfortunately, the documents had deceptively similar, but not perfectly consistent structure. I
# handled as many exceptions as possible.

# Overall, the lowest common denominator was to preserve sentence-level splits, where a "sentence" is most commonly a paragraph
# from the case. Apart from the document and section headers, a paragraph typically begins with a number. Note that subparagraphs
# can include their own numbering systems.

In [12]:
# In the event that you would like to draw upon the original HTML, it is stored in the 'html' field:
a = doc['html']
print(a)

<


In [6]:
# Most likely, you would like to access one abstraction level higher: the sentence-level plain text.
# This is preserved as a list of strings in the 'sentences' field.

print(doc['sentences'])

['SECOND SECTION', 'CASE OF RASPOPOVIĆ AND OTHERS v. MONTENEGRO', '( Application no. 58942/11 and 2 others -', 'see appended list )', 'JUDGMENT', 'STRASBOURG', '26 March 2020', 'This judgment is final but it may be subject to editorial revision.', 'In the case of Raspopović and Others v. Montenegro,', 'The European Court of Human Rights ( Second Section ), sitting as a Committee composed of:', 'Arnfinn Bårdsen, President, Ivana Jelić, Darian Pavli, judges, and Liv Tigerstedt, Acting Deputy Section Registrar,', 'Having deliberated in private on 5 March 2020,', 'Delivers the following judgment, which was adopted on that date:', 'PROCEDURE', '1. The case originated in applications against Montenegro lodged with the Court under Article 34 of the Convention for the Protection of Human Rights and Fundamental Freedoms (“the Convention”) on the various dates indicated in the appended table.', '2. The Montenegrin Government (“the Government”) were given notice of the applications.', 'THE FACTS'

In [7]:
# ECHR judgements have the advantage of distinct sections for facts, legal reasoning, etc.
# The breakpoints are somewhat regular (common headers), and most exceptions were handled.
# Each of the fields below holds one section as a list of strings; they are printed in the
# order facts, law, conclusion.
for section_field in (
    'PCR_FACTS',                # Facts
    'PCR_REMAINDER_REMAINDER',  # Law
    'PCR_CONCLUSION',           # Conclusion
):
    print(doc[section_field])

['3. The list of applicants and the relevant details of the applications are set out in the appended table.', '4. The applicants complained of the excessive length of civil proceedings.']
['THE LAW', 'JOINDER OF THE APPLICATIONS', '5. Having regard to the similar subject matter of the applications, the Court finds it appropriate to examine them jointly in a single judgment.', 'ALLEGED VIOLATION OF ARTICLE 6 § 1 OF THE CONVENTION', '6. The applicants complained that the length of the civil proceedings in question had been incompatible with the “reasonable time” requirement. They relied on Article 6 § 1 of the Convention, which reads as follows:', 'Article 6 § 1', '“In the determination of his civil rights and obligations ... everyone is entitled to a ... hearing within a reasonable time by [a] ... tribunal ...”', '7. The Court reiterates that the reasonableness of the length of proceedings must be assessed in the light of the circumstances of the case and with reference to the following

In [8]:
# ECHR judgements cite prior cases if they are relevant. The citation structure is only somewhat consistent.
# A lot of effort was invested to parse a citation graph across the documents. The precision and recall are 0.86 and 0.89 respectively.
# The various mining strategies and their precursors are stored in the following fields:

# Strasbourg case law citations (the scrape included these citations in a metadata page, which appear to be manually extracted from the doc).
# Some include appnos, which we stored in APPNO_DIRECT. Others required more sophisticated, error-prone matching.
# NOTE(review): the field printed below is SCL_APPNO_DIRECT; the sentence above possibly means that field. Confirm.
# print(doc['SCL_EXTRACTIONS'])
# print(doc['SCL_APPNO_DIRECT'])

# Citations from the text, whenever v. occurred. Not all citations included a defendant, so v. was not a sufficient indicator of a citation.
# print(doc['V_DOT_EXTRACTIONS'])

# Citations where the appnos are mentioned. Also not consistently available. Occasionally erroneous when referring to a different case
# numbering system, like from a national court (mostly dealt with).
# print(doc['APPNO_DIRECT'])

# A merged set of these strategies. This is the final list of cases cited by a given case.
print(doc['APPNOS_MERGE2'])


['30979/96', '49320/07']


In [9]:
# Judgements are made at the article level for each article that is believed to be violated.
# The list of such alleged article violations has been mined and stored at:
print(doc['articles_merged'])

# The set originates from two mining strategies applied to different sections. For the components, see:
# print(doc['articles_from_conclusion'])
# print(doc['articles_from_header_sentences'])

['6']


In [10]:
# Judgements are made by a vote of multiple judges for each article. The outcome of these votes has been mined and stored at:
# NOTE(review): the field can be an empty list; presumably that means no vote split was extracted for the document. Confirm.
print(doc['split_votes'])

[]
