Skip to content

Commit

Permalink
activate the open data parsing for 15th legislature doslegs
Browse files Browse the repository at this point in the history
fixes #7
  • Loading branch information
mdamien committed Jun 5, 2018
1 parent 22d293a commit 4052fd6
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 132 deletions.
5 changes: 5 additions & 0 deletions anpy/dossier.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
AN_BASE_URL = 'http://www.assemblee-nationale.fr'


def get_legislature(url_an):
    """Extract the legislature number from an assemblee-nationale.fr URL.

    Handles both classic URLs (``.fr/15/dossiers/...``) and open-data
    ones (``.fr/dyn/15/dossiers/...``).

    Returns the legislature as an int, or None when the URL carries no
    legislature number.  (Previously a non-matching URL raised
    AttributeError on the None match object, although callers already
    guard with ``if legislature``.)
    """
    legislature_match = re.search(r"\.fr/(dyn/)?(\d+)/", url_an)
    if legislature_match is None:
        return None
    return int(legislature_match.group(2))


class InvalidResponseException(Exception):
pass

Expand Down
159 changes: 103 additions & 56 deletions anpy/dossier_from_opendata.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,18 @@
"""
Experiment to use the JSON formatted data provided byt the AN for the doslegs
Experiment to use the JSON formatted data provided by the AN for the doslegs
- step 1: download the data needed here:
http://data.assemblee-nationale.fr/static/openData/repository/15/loi/dossiers_legislatifs/Dossiers_Legislatifs_XV.json.zip
- step 2: you can parse a dosleg by doing
python dossier_from_opendata.py dossiers.json <dosleg_url>
python dossier_from_opendata.py <dosleg_url>
"""

import sys
import json
import zipfile
import io

from lawfactory_utils.urls import download, enable_requests_cache, clean_url
import lawfactory_utils.urls


def _log(*args):
print(*args, file=sys.stderr)
from anpy.dossier import get_legislature


def yield_leafs(etape, path=None):
Expand Down Expand Up @@ -43,11 +41,50 @@ def test_status(url):
return resp


def parse(url, dossiers_json):
docs = {
doc["uid"]: doc
for doc in dossiers_json["export"]["textesLegislatifs"]["document"]
def download_open_data_doslegs(legislature):
    """Download and parse the AN open-data doslegs dump for a legislature.

    Only legislatures 14 and 15 are published; any other value raises
    KeyError.  Returns the decoded JSON content as a dict.
    """
    files = {
        15: (
            "Dossiers_Legislatifs_XV.json",
            "http://data.assemblee-nationale.fr/static/openData/repository/15/loi/dossiers_legislatifs/Dossiers_Legislatifs_XV.json.zip",
        ),
        14: (
            "Dossiers_Legislatifs_XIV.json",
            "http://data.assemblee-nationale.fr/static/openData/repository/14/loi/dossiers_legislatifs/Dossiers_Legislatifs_XIV.json.zip",
        ),
    }
    file, file_url = files[legislature]

    # TODO: remove this hack when we are able to cache the zip files
    # Fix: restore the module-global flag even when download() raises,
    # otherwise caching stays silently disabled for the whole process.
    saved_cache_flag = lawfactory_utils.urls.CACHE_ENABLED
    lawfactory_utils.urls.CACHE_ENABLED = False
    try:
        doslegs_resp = download(file_url)
    finally:
        lawfactory_utils.urls.CACHE_ENABLED = saved_cache_flag

    doslegs_zip = zipfile.ZipFile(io.BytesIO(doslegs_resp.content))
    with doslegs_zip.open(file) as fp:
        return json.loads(fp.read().decode("utf-8"))


def parse(url, verbose=True, logfile=sys.stderr, cached_opendata_an={}):
if not verbose:

def _log(*x):
return None

else:

def _log(*args):
nonlocal logfile
print(*args, file=logfile)

legislature = get_legislature(url)
if legislature and legislature in cached_opendata_an:
dossiers_json = cached_opendata_an[legislature]
else:
dossiers_json = download_open_data_doslegs(get_legislature(url))

docs = {doc["uid"]: doc for doc in dossiers_json["export"]["textesLegislatifs"]["document"]}

for dossier in dossiers_json["export"]["dossiersLegislatifs"]["dossier"]:
dossier = dossier["dossierParlementaire"]
Expand All @@ -62,28 +99,27 @@ def parse(url, dossiers_json):

data = {}
data["urgence"] = False
data["url_dossier_senat"] = dossier["titreDossier"]["senatChemin"]
url_senat = dossier["titreDossier"]["senatChemin"]
if url_senat:
data["url_dossier_senat"] = url_senat
data["long_title"] = dossier["titreDossier"]["titre"]
data["url_dossier_assemblee"] = url
data["assemblee_legislature"] = int(dossier["legislature"])
data["assemblee_slug"] = dossier["titreDossier"]["titreChemin"]
data["assemblee_id"] = "%s-%s" % (
dossier["legislature"],
data["assemblee_slug"],
)
data["assemblee_id"] = "%s-%s" % (dossier["legislature"], data["assemblee_slug"])

data["steps"] = []
for etape in to_arr(dossier["actesLegislatifs"]["acteLegislatif"]):
for path, sous_etape in yield_leafs(etape):
if sous_etape["@xsi:type"] in ("EtudeImpact_Type",):
if sous_etape["@xsi:type"] in ("EtudeImpact_Type", "DepotAvisConseilEtat_Type"):
continue

step = {"date": sous_etape.get("dateActe").split("T")[0]}

if sous_etape["@xsi:type"] == "ProcedureAccelere_Type":
data["urgence"] = True
elif sous_etape["@xsi:type"] == "Promulgation_Type":
url = clean_url(sous_etape["urlLegifrance"])
url = clean_url(sous_etape.get("urlLegifrance") or sous_etape["infoJO"]["urlLegifrance"])
data["url_jo"] = url
data["end"] = step["date"]

Expand All @@ -100,13 +136,14 @@ def parse(url, dossiers_json):

if "textesAssocies" in sous_etape:
# TODO review
sous_etape["texteAssocie"] = sous_etape["textesAssocies"][
"texteAssocie"
]["refTexteAssocie"]
sous_etape["texteAssocie"] = to_arr(sous_etape["textesAssocies"]["texteAssocie"])[0]["refTexteAssocie"]

if "texteAdopte" in sous_etape or "texteAssocie" in sous_etape:
code = sous_etape.get("codeActe")

if "AVIS-RAPPORT" in code:
continue

if code.startswith("AN"):
step["institution"] = "assemblee"
elif code.startswith("SN"):
Expand All @@ -132,7 +169,7 @@ def parse(url, dossiers_json):
elif "CMP-" in code:
step["stage"] = "CMP"
if "RAPPORT-AN" in code:
step["institution"] = "assemblee"
step["institution"] = "CMP" # TODO: still CMP commission left
elif "RAPPORT-SN" in code:
step["institution"] = "senat"
continue #  TODO: add link to CMP commission step
Expand All @@ -141,56 +178,66 @@ def parse(url, dossiers_json):

# step['xsi-type'] = sous_etape.get('@xsi:type')
# step['code'] = sous_etape.get('codeActe')
step["id_step_opendata"] = sous_etape["uid"]

id_text = sous_etape.get("texteAdopte", sous_etape["texteAssocie"])
if id_text:
step["id_text_opendata"] = id_text
if "proposal_type" not in data:
if id_text.startswith("PRJL"):
data["proposal_type"] = "PJL"
elif id_text.startswith("PION"):
data["proposal_type"] = "PPL"

if step.get("institution") == "assemblee" and id_text:
doc = {}
if id_text in docs:
doc = docs[id_text]
else:
_log(" - ERROR missing text", id_text)

url = None
if step.get("institution") == "assemblee":
text_no = id_text[-4:]

url = None
if step.get("step") == "commission":
if step.get("stage") == "CMP":
url = (
"http://www.assemblee-nationale.fr/{}/rapport/{}.asp"
)
url = "http://www.assemblee-nationale.fr/{}/rapport/{}.asp"
else:
url = (
"http://www.assemblee-nationale.fr/{}/ta-commission/r{}-a0.asp"
)
url = "http://www.assemblee-nationale.fr/{}/ta-commission/r{}-a0.asp"
elif step.get("step") == "depot":
if data["proposal_type"] == "PJL":
url = (
"http://www.assemblee-nationale.fr/{}/projets/pl{}.asp"
)
url = "http://www.assemblee-nationale.fr/{}/projets/pl{}.asp"
else:
url = (
"http://www.assemblee-nationale.fr/{}/propositions/pion{}.asp"
)
url = "http://www.assemblee-nationale.fr/{}/propositions/pion{}.asp"
elif step.get("step") == "hemicycle":
url = "http://www.assemblee-nationale.fr/{}/ta/ta{}.asp"

if url:
doc = {}
if id_text in docs:
doc = docs[id_text]
else:
_log(" - ERROR missing text", id_text)
legislature = doc.get(
"legislature", data["assemblee_legislature"]
)
url = url.format(legislature, text_no)
step["source_url"] = url

if not url or not test_status(url):
_log(" - INVALID text url -", url, step)
_log()
#  TODO: url senat
"""
if step.get("institution") == "senat":
# TODO guess "legislature" because it's missing
text_no = id_text[-3:]
ppl_or_pjl = data["proposal_type"].lower()
if step.get("step") == "commission":
url = "https://www.senat.fr/leg/%s{}-{}.html" % ppl_or_pjl
elif step.get("step") == "depot":
url = "https://www.senat.fr/leg/%s{}-{}.html" % ppl_or_pjl
elif step.get("step") == "hemicycle":
url = "https://www.senat.fr/leg/tas{}-{}.html"
"""

if url:
legislature = doc.get("legislature", data["assemblee_legislature"])
url = url.format(legislature, text_no)
step["source_url"] = url

"""
if not url or not test_status(url):
_log(" - INVALID text url -", url, step)
_log()
"""

#  TODO: url CMP + other url
data["steps"].append(step)

Expand All @@ -199,11 +246,11 @@ def parse(url, dossiers_json):
data["beggining"] = data["steps"][0]["date"]

return data
return []


if __name__ == "__main__":
    # CLI entry point: parse one dosleg URL and print the result as JSON.
    # (Removed the stale diff residue that re-read sys.argv[1] as a local
    # JSON dump — the open-data file is now downloaded automatically by
    # parse() via download_open_data_doslegs().)
    enable_requests_cache()
    url = sys.argv[1]
    data = parse(url)
    print(json.dumps(data, indent=2, sort_keys=True, ensure_ascii=False))
47 changes: 29 additions & 18 deletions anpy/dossier_like_senapy.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
# re-write of the dosleg parser to have
# the same output as senapy

import json
import sys
import re
from urllib.parse import urljoin

import requests
import dateparser
from bs4 import BeautifulSoup

from lawfactory_utils.urls import clean_url, download

from anpy.dossier_from_opendata import parse as opendata_parse
from anpy.dossier import get_legislature


def format_date(date):
parsed = dateparser.parse(date, languages=['fr'])
Expand Down Expand Up @@ -55,7 +56,7 @@ def merge_previous_works_an(older_dos, dos):
return dos


def parse(html, url_an=None, verbose=True, logfile=sys.stderr, nth_dos_in_page=0, parse_previous_works=True, parse_next_works=True):
def historic_doslegs_parse(html, url_an=None, verbose=True, logfile=sys.stderr, nth_dos_in_page=0, parse_previous_works=True, parse_next_works=True):
"""
Parse an AN dosleg like http://www.assemblee-nationale.fr/13/dossiers/accord_Montenegro_mobilite_jeunes.asp
Expand Down Expand Up @@ -346,45 +347,55 @@ def get_last_step():
if 'previous_works' in data and parse_previous_works:
log_warning('MERGING WITH PREVIOUS WORKS', data['previous_works'])
resp = download_an(data['previous_works'])
prev_data = parse(resp.text, data['previous_works'], verbose=verbose, nth_dos_in_page=nth_dos_in_page, parse_next_works=False)
prev_data = historic_doslegs_parse(
resp.text, data['previous_works'],
logfile=logfile, verbose=verbose,
nth_dos_in_page=nth_dos_in_page, parse_next_works=False)
if prev_data:
prev_data = prev_data[nth_dos_in_page] if len(prev_data) > 1 else prev_data[0]
data = merge_previous_works_an(prev_data, data)
else:
log_warning('INVALID PREVIOUS WORKS', data['previous_works'])

# is this part of a dosleg previous works ?
if 'assemblee_legislature' in data and parse_next_works:
next_legislature = data['assemblee_legislature'] + 1 if 'assemblee_legislature' in data else 9999
if parse_next_works and next_legislature < 15:
# TODO: parse 15th legislature from open data if it exists
resp = download_an(url_an.replace('/%d/' % data['assemblee_legislature'], '/%d/' % (data['assemblee_legislature'] + 1)))
if resp.status_code == 200:
recent_data = parse(resp.text, resp.url, verbose=verbose, nth_dos_in_page=nth_dos_in_page, parse_previous_works=False)
recent_data = historic_doslegs_parse(
resp.text, resp.url,
logfile=logfile, verbose=verbose,
nth_dos_in_page=nth_dos_in_page, parse_previous_works=False)
if recent_data:
log_warning('FOUND MORE RECENT WORKS', resp.url)
recent_data = recent_data[nth_dos_in_page] if len(recent_data) > 1 else recent_data[0]
data = merge_previous_works_an(data, recent_data)

if another_dosleg_inside:
others = parse(another_dosleg_inside, url_an, verbose=verbose, nth_dos_in_page=nth_dos_in_page+1)
others = historic_doslegs_parse(another_dosleg_inside, url_an, logfile=logfile, verbose=verbose, nth_dos_in_page=nth_dos_in_page+1)
if others:
return [data] + others
return [data]


if __name__ == '__main__':
url = sys.argv[1]
if url.startswith('http'):
html = requests.get(url).text
data = parse(html, url)
def parse(url_an, verbose=True, logfile=sys.stderr, cached_opendata_an=None):
    """Parse an AN dosleg, dispatching on the legislature.

    Doslegs from the 15th legislature onward are parsed from the AN
    open-data JSON (``/dyn/`` URLs); older ones go through the historic
    HTML parser.

    Parameters:
        url_an: dosleg URL on assemblee-nationale.fr
        verbose: when False, suppress log output
        logfile: stream for log messages (default: stderr)
        cached_opendata_an: optional {legislature: parsed-json} cache to
            avoid re-downloading the open-data dumps

    Returns a list of dosleg dicts, or None when the open-data parser
    finds nothing for the URL.
    """
    # Fix: mutable default argument ({}) replaced by the None sentinel so
    # the cache dict is not shared across calls.
    if cached_opendata_an is None:
        cached_opendata_an = {}

    legislature = get_legislature(url_an)
    # 15th-and-later doslegs only exist on the open-data "/dyn/" pages
    if legislature > 14 and '/dyn/' not in url_an:
        url_an = url_an.replace('.fr', '.fr/dyn').replace('.asp', '')

    if '/dyn/' in url_an:
        parsed = opendata_parse(url_an, verbose=verbose, logfile=logfile,
                                cached_opendata_an=cached_opendata_an)
        if parsed:
            return [parsed]
        return None

    resp = download_an(url_an)
    return historic_doslegs_parse(resp.text, url_an, verbose=verbose, logfile=logfile)


"""
Cas non-gérés:
Cas non-gérés (anciens dossiers):
- renvois en commision: http://www.assemblee-nationale.fr/14/dossiers/interdiction_prescription_acquisitive_voies_rurales.asp
- senat ppl manquant: http://www.assemblee-nationale.fr/13/dossiers/comite_poids_mesures.asp
- windows-1252 encoding: http://www.assemblee-nationale.fr/15/dossiers/responsabilite_financiere_dirigeants_benevoles_associations.asp
"""
Loading

0 comments on commit 4052fd6

Please sign in to comment.