Skip to content

Commit

Permalink
activate the open data parsing for 15th legislature doslegs
Browse files Browse the repository at this point in the history
fixes #7
  • Loading branch information
mdamien committed Jun 5, 2018
1 parent 22d293a commit 4052fd6
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 132 deletions.
5 changes: 5 additions & 0 deletions anpy/dossier.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
AN_BASE_URL = 'http://www.assemblee-nationale.fr'


def get_legislature(url_an):
    """Extract the legislature number from an assemblee-nationale.fr URL.

    Handles both classic URLs (``.fr/15/dossiers/...``) and open-data
    ones (``.fr/dyn/15/dossiers/...``).

    Returns the legislature as an int, or None when the URL carries no
    legislature number.  (Previously a non-matching URL raised
    AttributeError on the None match object, although callers already
    guard with ``if legislature``.)
    """
    legislature_match = re.search(r"\.fr/(dyn/)?(\d+)/", url_an)
    if legislature_match is None:
        return None
    return int(legislature_match.group(2))


class InvalidResponseException(Exception):
pass

Expand Down
159 changes: 103 additions & 56 deletions anpy/dossier_from_opendata.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,18 @@
"""
Experiment to use the JSON formatted data provided byt the AN for the doslegs
Experiment to use the JSON formatted data provided by the AN for the doslegs
- step 1: download the data needed here:
http://data.assemblee-nationale.fr/static/openData/repository/15/loi/dossiers_legislatifs/Dossiers_Legislatifs_XV.json.zip
- step 2: you can parse a dosleg by doing
python dossier_from_opendata.py dossiers.json <dosleg_url>
python dossier_from_opendata.py <dosleg_url>
"""

import sys
import json
import zipfile
import io

from lawfactory_utils.urls import download, enable_requests_cache, clean_url
import lawfactory_utils.urls


def _log(*args):
print(*args, file=sys.stderr)
from anpy.dossier import get_legislature


def yield_leafs(etape, path=None):
Expand Down Expand Up @@ -43,11 +41,50 @@ def test_status(url):
return resp


def parse(url, dossiers_json):
docs = {
doc["uid"]: doc
for doc in dossiers_json["export"]["textesLegislatifs"]["document"]
def download_open_data_doslegs(legislature):
    """Download and parse the AN open-data doslegs dump for a legislature.

    Only legislatures 14 and 15 are published; any other value raises
    KeyError.  Returns the decoded JSON content as a dict.
    """
    files = {
        15: (
            "Dossiers_Legislatifs_XV.json",
            "http://data.assemblee-nationale.fr/static/openData/repository/15/loi/dossiers_legislatifs/Dossiers_Legislatifs_XV.json.zip",
        ),
        14: (
            "Dossiers_Legislatifs_XIV.json",
            "http://data.assemblee-nationale.fr/static/openData/repository/14/loi/dossiers_legislatifs/Dossiers_Legislatifs_XIV.json.zip",
        ),
    }
    file, file_url = files[legislature]

    # TODO: remove this hack when we are able to cache the zip files
    # Fix: restore the module-global flag even when download() raises,
    # otherwise caching stays silently disabled for the whole process.
    saved_cache_flag = lawfactory_utils.urls.CACHE_ENABLED
    lawfactory_utils.urls.CACHE_ENABLED = False
    try:
        doslegs_resp = download(file_url)
    finally:
        lawfactory_utils.urls.CACHE_ENABLED = saved_cache_flag

    doslegs_zip = zipfile.ZipFile(io.BytesIO(doslegs_resp.content))
    with doslegs_zip.open(file) as fp:
        return json.loads(fp.read().decode("utf-8"))


def parse(url, verbose=True, logfile=sys.stderr, cached_opendata_an={}):
if not verbose:

def _log(*x):
return None

else:

def _log(*args):
nonlocal logfile
print(*args, file=logfile)

legislature = get_legislature(url)
if legislature and legislature in cached_opendata_an:
dossiers_json = cached_opendata_an[legislature]
else:
dossiers_json = download_open_data_doslegs(get_legislature(url))

docs = {doc["uid"]: doc for doc in dossiers_json["export"]["textesLegislatifs"]["document"]}

for dossier in dossiers_json["export"]["dossiersLegislatifs"]["dossier"]:
dossier = dossier["dossierParlementaire"]
Expand All @@ -62,28 +99,27 @@ def parse(url, dossiers_json):

data = {}
data["urgence"] = False
data["url_dossier_senat"] = dossier["titreDossier"]["senatChemin"]
url_senat = dossier["titreDossier"]["senatChemin"]
if url_senat:
data["url_dossier_senat"] = url_senat
data["long_title"] = dossier["titreDossier"]["titre"]
data["url_dossier_assemblee"] = url
data["assemblee_legislature"] = int(dossier["legislature"])
data["assemblee_slug"] = dossier["titreDossier"]["titreChemin"]
data["assemblee_id"] = "%s-%s" % (
dossier["legislature"],
data["assemblee_slug"],
)
data["assemblee_id"] = "%s-%s" % (dossier["legislature"], data["assemblee_slug"])

data["steps"] = []
for etape in to_arr(dossier["actesLegislatifs"]["acteLegislatif"]):
for path, sous_etape in yield_leafs(etape):
if sous_etape["@xsi:type"] in ("EtudeImpact_Type",):
if sous_etape["@xsi:type"] in ("EtudeImpact_Type", "DepotAvisConseilEtat_Type"):
continue

step = {"date": sous_etape.get("dateActe").split("T")[0]}

if sous_etape["@xsi:type"] == "ProcedureAccelere_Type":
data["urgence"] = True
elif sous_etape["@xsi:type"] == "Promulgation_Type":
url = clean_url(sous_etape["urlLegifrance"])
url = clean_url(sous_etape.get("urlLegifrance") or sous_etape["infoJO"]["urlLegifrance"])
data["url_jo"] = url
data["end"] = step["date"]

Expand All @@ -100,13 +136,14 @@ def parse(url, dossiers_json):

if "textesAssocies" in sous_etape:
# TODO review
sous_etape["texteAssocie"] = sous_etape["textesAssocies"][
"texteAssocie"
]["refTexteAssocie"]
sous_etape["texteAssocie"] = to_arr(sous_etape["textesAssocies"]["texteAssocie"])[0]["refTexteAssocie"]

if "texteAdopte" in sous_etape or "texteAssocie" in sous_etape:
code = sous_etape.get("codeActe")

if "AVIS-RAPPORT" in code:
continue

if code.startswith("AN"):
step["institution"] = "assemblee"
elif code.startswith("SN"):
Expand All @@ -132,7 +169,7 @@ def parse(url, dossiers_json):
elif "CMP-" in code:
step["stage"] = "CMP"
if "RAPPORT-AN" in code:
step["institution"] = "assemblee"
step["institution"] = "CMP" # TODO: still CMP commission left
elif "RAPPORT-SN" in code:
step["institution"] = "senat"
continue #  TODO: add link to CMP commission step
Expand All @@ -141,56 +178,66 @@ def parse(url, dossiers_json):

# step['xsi-type'] = sous_etape.get('@xsi:type')
# step['code'] = sous_etape.get('codeActe')
step["id_step_opendata"] = sous_etape["uid"]

id_text = sous_etape.get("texteAdopte", sous_etape["texteAssocie"])
if id_text:
step["id_text_opendata"] = id_text
if "proposal_type" not in data:
if id_text.startswith("PRJL"):
data["proposal_type"] = "PJL"
elif id_text.startswith("PION"):
data["proposal_type"] = "PPL"

if step.get("institution") == "assemblee" and id_text:
doc = {}
if id_text in docs:
doc = docs[id_text]
else:
_log(" - ERROR missing text", id_text)

url = None
if step.get("institution") == "assemblee":
text_no = id_text[-4:]

url = None
if step.get("step") == "commission":
if step.get("stage") == "CMP":
url = (
"http://www.assemblee-nationale.fr/{}/rapport/{}.asp"
)
url = "http://www.assemblee-nationale.fr/{}/rapport/{}.asp"
else:
url = (
"http://www.assemblee-nationale.fr/{}/ta-commission/r{}-a0.asp"
)
url = "http://www.assemblee-nationale.fr/{}/ta-commission/r{}-a0.asp"
elif step.get("step") == "depot":
if data["proposal_type"] == "PJL":
url = (
"http://www.assemblee-nationale.fr/{}/projets/pl{}.asp"
)
url = "http://www.assemblee-nationale.fr/{}/projets/pl{}.asp"
else:
url = (
"http://www.assemblee-nationale.fr/{}/propositions/pion{}.asp"
)
url = "http://www.assemblee-nationale.fr/{}/propositions/pion{}.asp"
elif step.get("step") == "hemicycle":
url = "http://www.assemblee-nationale.fr/{}/ta/ta{}.asp"

if url:
doc = {}
if id_text in docs:
doc = docs[id_text]
else:
_log(" - ERROR missing text", id_text)
legislature = doc.get(
"legislature", data["assemblee_legislature"]
)
url = url.format(legislature, text_no)
step["source_url"] = url

if not url or not test_status(url):
_log(" - INVALID text url -", url, step)
_log()
#  TODO: url senat
"""
if step.get("institution") == "senat":
# TODO guess "legislature" because it's missing
text_no = id_text[-3:]
ppl_or_pjl = data["proposal_type"].lower()
if step.get("step") == "commission":
url = "https://www.senat.fr/leg/%s{}-{}.html" % ppl_or_pjl
elif step.get("step") == "depot":
url = "https://www.senat.fr/leg/%s{}-{}.html" % ppl_or_pjl
elif step.get("step") == "hemicycle":
url = "https://www.senat.fr/leg/tas{}-{}.html"
"""

if url:
legislature = doc.get("legislature", data["assemblee_legislature"])
url = url.format(legislature, text_no)
step["source_url"] = url

"""
if not url or not test_status(url):
_log(" - INVALID text url -", url, step)
_log()
"""

#  TODO: url CMP + other url
data["steps"].append(step)

Expand All @@ -199,11 +246,11 @@ def parse(url, dossiers_json):
data["beggining"] = data["steps"][0]["date"]

return data
return []


if __name__ == "__main__":
    # CLI entry point: parse one dosleg URL and print the result as JSON.
    # (Removed the stale diff residue that re-read sys.argv[1] as a local
    # JSON dump — the open-data file is now downloaded automatically by
    # parse() via download_open_data_doslegs().)
    enable_requests_cache()
    url = sys.argv[1]
    data = parse(url)
    print(json.dumps(data, indent=2, sort_keys=True, ensure_ascii=False))
47 changes: 29 additions & 18 deletions anpy/dossier_like_senapy.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
# re-write of the dosleg parser to have
# the same output as senapy

import json
import sys
import re
from urllib.parse import urljoin

import requests
import dateparser
from bs4 import BeautifulSoup

from lawfactory_utils.urls import clean_url, download

from anpy.dossier_from_opendata import parse as opendata_parse
from anpy.dossier import get_legislature


def format_date(date):
parsed = dateparser.parse(date, languages=['fr'])
Expand Down Expand Up @@ -55,7 +56,7 @@ def merge_previous_works_an(older_dos, dos):
return dos


def parse(html, url_an=None, verbose=True, logfile=sys.stderr, nth_dos_in_page=0, parse_previous_works=True, parse_next_works=True):
def historic_doslegs_parse(html, url_an=None, verbose=True, logfile=sys.stderr, nth_dos_in_page=0, parse_previous_works=True, parse_next_works=True):
"""
Parse an AN dosleg like http://www.assemblee-nationale.fr/13/dossiers/accord_Montenegro_mobilite_jeunes.asp
Expand Down Expand Up @@ -346,45 +347,55 @@ def get_last_step():
if 'previous_works' in data and parse_previous_works:
log_warning('MERGING WITH PREVIOUS WORKS', data['previous_works'])
resp = download_an(data['previous_works'])
prev_data = parse(resp.text, data['previous_works'], verbose=verbose, nth_dos_in_page=nth_dos_in_page, parse_next_works=False)
prev_data = historic_doslegs_parse(
resp.text, data['previous_works'],
logfile=logfile, verbose=verbose,
nth_dos_in_page=nth_dos_in_page, parse_next_works=False)
if prev_data:
prev_data = prev_data[nth_dos_in_page] if len(prev_data) > 1 else prev_data[0]
data = merge_previous_works_an(prev_data, data)
else:
log_warning('INVALID PREVIOUS WORKS', data['previous_works'])

# is this part of a dosleg previous works ?
if 'assemblee_legislature' in data and parse_next_works:
next_legislature = data['assemblee_legislature'] + 1 if 'assemblee_legislature' in data else 9999
if parse_next_works and next_legislature < 15:
# TODO: parse 15th legislature from open data if it exists
resp = download_an(url_an.replace('/%d/' % data['assemblee_legislature'], '/%d/' % (data['assemblee_legislature'] + 1)))
if resp.status_code == 200:
recent_data = parse(resp.text, resp.url, verbose=verbose, nth_dos_in_page=nth_dos_in_page, parse_previous_works=False)
recent_data = historic_doslegs_parse(
resp.text, resp.url,
logfile=logfile, verbose=verbose,
nth_dos_in_page=nth_dos_in_page, parse_previous_works=False)
if recent_data:
log_warning('FOUND MORE RECENT WORKS', resp.url)
recent_data = recent_data[nth_dos_in_page] if len(recent_data) > 1 else recent_data[0]
data = merge_previous_works_an(data, recent_data)

if another_dosleg_inside:
others = parse(another_dosleg_inside, url_an, verbose=verbose, nth_dos_in_page=nth_dos_in_page+1)
others = historic_doslegs_parse(another_dosleg_inside, url_an, logfile=logfile, verbose=verbose, nth_dos_in_page=nth_dos_in_page+1)
if others:
return [data] + others
return [data]


if __name__ == '__main__':
url = sys.argv[1]
if url.startswith('http'):
html = requests.get(url).text
data = parse(html, url)
def parse(url_an, verbose=True, logfile=sys.stderr, cached_opendata_an=None):
    """Parse an AN dosleg, dispatching on the legislature.

    Doslegs from the 15th legislature onward are parsed from the AN
    open-data JSON (``/dyn/`` URLs); older ones go through the historic
    HTML parser.

    Parameters:
        url_an: dosleg URL on assemblee-nationale.fr
        verbose: when False, suppress log output
        logfile: stream for log messages (default: stderr)
        cached_opendata_an: optional {legislature: parsed-json} cache to
            avoid re-downloading the open-data dumps

    Returns a list of dosleg dicts, or None when the open-data parser
    finds nothing for the URL.
    """
    # Fix: mutable default argument ({}) replaced by the None sentinel so
    # the cache dict is not shared across calls.
    if cached_opendata_an is None:
        cached_opendata_an = {}

    legislature = get_legislature(url_an)
    # 15th-and-later doslegs only exist on the open-data "/dyn/" pages
    if legislature > 14 and '/dyn/' not in url_an:
        url_an = url_an.replace('.fr', '.fr/dyn').replace('.asp', '')

    if '/dyn/' in url_an:
        parsed = opendata_parse(url_an, verbose=verbose, logfile=logfile,
                                cached_opendata_an=cached_opendata_an)
        if parsed:
            return [parsed]
        return None

    resp = download_an(url_an)
    return historic_doslegs_parse(resp.text, url_an, verbose=verbose, logfile=logfile)


"""
Cas non-gérés:
Cas non-gérés (anciens dossiers):
- renvois en commision: http://www.assemblee-nationale.fr/14/dossiers/interdiction_prescription_acquisitive_voies_rurales.asp
- senat ppl manquant: http://www.assemblee-nationale.fr/13/dossiers/comite_poids_mesures.asp
- windows-1252 encoding: http://www.assemblee-nationale.fr/15/dossiers/responsabilite_financiere_dirigeants_benevoles_associations.asp
"""
Loading

0 comments on commit 4052fd6

Please sign in to comment.