# Rewrite of the dosleg parser to produce the same output as senapy.

import sys
import re
from urllib.parse import urljoin

import dateparser
from bs4 import BeautifulSoup
from lawfactory_utils.urls import clean_url, download, parse_national_assembly_url, AN_OLD_URL_TEMPLATE

from anpy.dossier_from_opendata import parse as opendata_parse


def format_date(date):
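    """
    Normalize a French date string to ISO format.

    >>> format_date('16 avril 2010')
    '2010-04-16'
    """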
parsed = dateparser.parse(date, languages=['fr'])
    return parsed.strftime("%Y-%m-%d")


def find_promulgation_date(line):
"""
>>> find_promulgation_date("Loi nº 2010-383 du 16 avril 2010 autorisant l'approbation de l'accord entre...")
'2010-04-16'
"""
line = line.split(' du ')[1]
    return format_date(re.search(r"(\d\d? \w\w\w+ \d\d\d\d)", line).group(1))


def find_senat_url(data):
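    """Find the Sénat dossier URL via links on the Sénat text pages referenced in the steps."""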
if not data['steps']:
return
senat_text_url = [step['source_url'] for step in data['steps'] if step.get('source_url') and 'senat.fr' in step.get('source_url')]
for url in senat_text_url:
html = download(url).text
soup = BeautifulSoup(html, 'lxml')
for a in soup.select('#primary a'):
href = urljoin(url, a.attrs.get('href', ''))
if 'dossier-legislatif/' in href or 'dossierleg/' in href:
                return clean_url(href)


def download_historic_dosleg(url):
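    """Download an old-style AN dosleg page; when the AN site force-redirects
    to the new /dyn/ pages, fall back to the regardscitoyens backup archive."""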
resp = download(url)
if '/dyn/' in resp.url:
# fallback to backed-up doslegs when the redirect is forced
legislature, slug = parse_national_assembly_url(url)
display_url = AN_OLD_URL_TEMPLATE.format(legislature=legislature, slug=slug)
download_url = 'https://raw.githubusercontent.com/regardscitoyens/archive-AN-doslegs/master/archive/' \
+ display_url.split('.fr/')[1]
resp = download(download_url)
resp.url = display_url
resp.encoding = 'Windows-1252'
    return resp


def merge_previous_works_an(older_dos, dos):
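    """Prepend the steps of older_dos to dos, dropping the duplicated overlap
    and the older promulgation step."""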
# remove promulgation step
if older_dos['steps'] and older_dos['steps'][-1].get('stage') == 'promulgation':
older_dos['steps'] = older_dos['steps'][:-1]
    # find where the new dosleg starts in the old one and merge at that point
    for i in range(1, len(older_dos['steps']) + 1):
        if older_dos['steps'][-i].get('source_url') == dos['steps'][0].get('source_url'):
            dos['steps'] = older_dos['steps'][:-i] + dos['steps']
            break
    else:
        dos['steps'] = older_dos['steps'] + dos['steps']
if 'url_dossier_senat' in older_dos and 'url_dossier_senat' not in dos:
dos['url_dossier_senat'] = older_dos['url_dossier_senat']
    return dos


def historic_doslegs_parse(html, url_an=None, logfile=sys.stderr, nth_dos_in_page=0, parse_previous_works=True, parse_next_works=True):
"""
Parse an AN dosleg like http://www.assemblee-nationale.fr/13/dossiers/accord_Montenegro_mobilite_jeunes.asp
nth_dos_in_page, parse_previous_works and parse_next_works are for internal logic
"""
data = {
'url_dossier_assemblee': clean_url(url_an),
'urgence': False,
}
def log_error(*error):
print('## ERROR ###', *error, file=logfile)
def log_warning(*error):
print('## WARNING ###', *error, file=logfile)
soup = BeautifulSoup(html, 'lxml')
legislature, slug = parse_national_assembly_url(data['url_dossier_assemblee'])
data['assemblee_slug'] = slug
if legislature:
data['assemblee_legislature'] = legislature
else: # strange link (old dosleg)
log_error('NO LEGISLATURE IN AN LINK: ' + data['url_dossier_assemblee'])
data['assemblee_id'] = '%s-%s' % (data.get('assemblee_legislature', ''), data['assemblee_slug'])
data['steps'] = []
curr_institution = 'assemblee'
curr_stage = '1ère lecture'
last_section = None # Travaux des commissions/Discussion en séance publique
last_step_index = 0
travaux_prep_already = False
another_dosleg_inside = None
predicted_next_step = None # For unfinished projects, we try to catch the next step
previous_works = None
url_jo = None
html_lines = html.split('\n')
for i, line in enumerate(html_lines):
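        # helpers are redefined each iteration so they close over the current line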
def parse_line():
return BeautifulSoup(line, 'lxml')
def line_text():
return parse_line().text.strip()
def get_last_step():
if len(data['steps']) > 0:
return data['steps'][-1]
return {}
if '<COMMENTAIRE>' in line or '<table border="1"' in line:
continue
if '<font face="ARIAL" size="3" color="#000080">' in line:
data['long_title'] = line_text()
if '<br><b><font color="#000099">Travaux des commissions</font></b><br>' in line:
last_section = line_text()
if '<p align="center"><b><font color="#000080">Travaux préparatoires</font></b><br>' in line:
if travaux_prep_already:
if parse_next_works and not nth_dos_in_page:
log_warning('FOUND ANOTHER DOSLEG INSIDE THE DOSLEG')
another_dosleg_inside = '\n'.join(html.split('\n')[last_step_index + 1:])
if not nth_dos_in_page:
break
travaux_prep_already = False
else:
travaux_prep_already = True
if not parse_next_works and travaux_prep_already and nth_dos_in_page:
continue
# Senat 1ère lecture, CMP, ...
if '<font color="#000099" size="2" face="Arial">' in line:
text = line_text()
last_section = None
if 'Dossier en ligne sur le site du Sénat' in text:
data['url_dossier_senat'] = clean_url(parse_line().select(
'a')[-1].attrs['href'])
text = text.replace(
'(Dossier en ligne sur le site du Sénat)', '')
if 'Sénat' in text:
curr_institution = 'senat'
elif 'Assemblée nationale' in text:
curr_institution = 'assemblee'
elif 'Commission Mixte Paritaire' in text or 'Lecture texte CMP' in text:
curr_institution = 'CMP'
curr_stage = 'CMP'
elif 'Conseil Constitutionnel' in text:
curr_institution = 'conseil constitutionnel'
curr_stage = 'constitutionnalité'
elif 'Congrès du Parlement' in text:
curr_institution = 'congrès'
curr_stage = 'congrès'
if '1ère lecture' in text:
curr_stage = '1ère lecture'
elif '2e lecture' in text:
curr_stage = '2ème lecture'
elif 'Nouvelle lecture' in text:
curr_stage = 'nouv. lect.'
elif 'Lecture définitive' in text:
curr_stage = 'l. définitive'
if not curr_stage:
curr_stage = text.split('-')[-1].strip().lower()
if curr_stage == "création de la commission d'enquête":
log_warning('COMMISSION D\'ENQUETE')
return None
if '>Proposition de résolution européenne<' in line:
            log_warning('PROPOSITION DE RESOLUTION EUROPEENNE')
return None
if '>Accès aux Travaux préparatoires' in line and not previous_works:
previous_works = clean_url(urljoin(url_an, parse_line().find('a').attrs['href']))
curr_step = None
# conseil. consti. has no step but we should get the link
no_step_but_good_link = False
if 'Rapport portant également sur les propositions' in line:
continue
elif re.search(r'<a[^>]* href=[^>]*>(projet de loi|proposition de loi|proposition de résolution)', line, re.I):
curr_step = 'depot'
if curr_stage == 'CMP':
continue
elif ">Texte de la commission" in line or '/ta-commission/' in line:
curr_step = 'commission'
elif '/ta/' in line or '/leg/tas' in line:
if get_last_step().get('stage') != curr_stage:
curr_step = 'depot'
if curr_stage == 'CMP':
curr_step = 'commission'
else:
curr_step = 'hemicycle'
elif ('/rapports/' in line or '/rap/' in line) and last_section and 'commissions' in last_section:
if get_last_step().get('step') == 'commission':
# log_warning('DOUBLE COMMISSION LINE: %s' % line)
continue
curr_step = 'commission'
elif 'www.conseil-constitutionnel.fr/decision/' in line:
no_step_but_good_link = True
# no commissions for l. définitive
if curr_stage == 'l. définitive' and curr_step == 'commission':
continue
if curr_step or no_step_but_good_link:
            # if the same step was recorded previously, decide whether to replace its url
            if get_last_step().get('step') == curr_step:
                # log_warning('DOUBLE STEP: %s' % line)
                # remove the last step since we prefer links to texts over links to reports
                # TODO: add report link as bonus_url
                last_url = get_last_step().get('source_url')
                if not last_url or ('/rapports/' in last_url or '/rap/' in last_url):
                    data['steps'] = data['steps'][:-1]
                else:
                    # the last url was already a text, so assume it's a multi-depot
                    # (multi-depot only outside CMP)
                    # TODO: re-order multi depot
                    if curr_institution == 'senat' and curr_stage != 'CMP':
                        curr_step = 'depot'
links = [a.attrs.get('href') for a in parse_line().select('a')]
links = [
href for href in links if href and 'fiches_id' not in href and '/senateur/' not in href and 'javascript:' not in href]
if not links:
log_error('NO LINK IN LINE: %s' % (line,))
continue
urls_raps = []
urls_others = []
for href in links:
if '/rap/' in href or '/rapports/' in href:
urls_raps.append(href)
else:
urls_others.append(href)
cmp_commission_other_url = None
if len(urls_others) > 0:
url = urls_others[0]
# CMP commission should produce two texts, one for each institution
if curr_step == 'commission' and curr_stage == 'CMP' and len(urls_others) > 1:
cmp_commission_other_url = clean_url(urljoin(url_an, urls_others[1]))
else:
url = urls_raps[0]
url = clean_url(urljoin(url_an, url))
real_institution = curr_institution
if curr_stage == 'CMP' and curr_step == 'hemicycle':
if 'assemblee-nationale.fr' in url:
real_institution = 'assemblee'
elif 'senat.fr' in url:
real_institution = 'senat'
step = {
'institution': real_institution,
'stage': curr_stage,
'source_url': url,
}
if curr_step:
step['step'] = curr_step
if cmp_commission_other_url:
step['cmp_commission_other_url'] = cmp_commission_other_url
# try to detect a date
for test_line in (line, html_lines[i-1]):
test_line = test_line.replace('1<sup>er</sup>', '1')
date_match = re.search(r'(déposée? le|adoptée? .*? le|modifiée? .*?|rejetée? .*?)\s*(\d\d? \w\w\w+ \d\d\d\d)', test_line, re.I)
if date_match:
step['date'] = format_date(date_match.group(2))
else:
date_match = re.search(r'(mis en ligne le)\s*(\d\d? \w\w\w+ \d\d\d\d)', test_line, re.I)
if date_match:
step['date'] = format_date(date_match.group(2))
if 'date' in step and 'beginning' not in data:
data['beginning'] = step['date']
data['steps'].append(step)
predicted_next_step = None
last_step_index = i
if 'publiée au Journal Officiel' in line and not url_jo:
links = [clean_url(a.attrs['href']) for a in parse_line().select('a') if 'legifrance' in a.attrs.get('href', '')]
if not links:
log_error('NO GOOD LINK IN LINE: %s' % (line,))
continue
url_jo = links[-1]
if 'Le Gouvernement a engagé la procédure accélérée' in line or 'engagement de la procédure accélérée' in line:
data['urgence'] = True
        # Next-step prediction via small clues
        # TODO: this could be done via last_section (we currently parse the same content twice)
        # TODO: this fails for the CMP hemicycle step at the Sénat
if curr_stage != 'CMP':
if '>Discussion en séance publique<' in line:
predicted_next_step = {
'institution': curr_institution,
'stage': curr_stage,
'step': 'hemicycle',
}
elif '>Travaux des commissions<' in line:
predicted_next_step = {
'institution': curr_institution,
'stage': curr_stage,
'step': 'commission',
}
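
    # old dosleg pages expose the promulgated law in <meta> tags (LOI_PROMULGUEE, LIEN_LOI_PROMULGUEE)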
metas = {}
for meta in soup.select('meta'):
if 'name' in meta.attrs:
metas[meta.attrs['name']] = meta.attrs['content']
if not url_jo:
url_jo = metas.get('LIEN_LOI_PROMULGUEE')
if url_jo:
data['url_jo'] = clean_url(url_jo)
promulgation_step = {
'institution': 'gouvernement',
'stage': 'promulgation',
'source_url': data['url_jo'],
}
if metas.get('LOI_PROMULGUEE'):
data['end'] = find_promulgation_date(metas.get('LOI_PROMULGUEE'))
promulgation_step['date'] = data['end']
data['steps'].append(promulgation_step)
# add predicted next step for unfinished projects
elif predicted_next_step:
data['steps'].append(predicted_next_step)
if 'url_dossier_senat' not in data or 'dossier-legislatif' not in data['url_dossier_senat']:
senat_url = find_senat_url(data)
if senat_url:
data['url_dossier_senat'] = senat_url
    # merge in previous works if there are any
if previous_works and parse_previous_works:
log_warning('MERGING %s WITH PREVIOUS WORKS %s' % (url_an, previous_works))
resp = download_historic_dosleg(previous_works)
prev_data = historic_doslegs_parse(
resp.text, previous_works,
logfile=logfile,
nth_dos_in_page=nth_dos_in_page, parse_next_works=False)
if prev_data:
prev_data = prev_data[nth_dos_in_page] if len(prev_data) > 1 else prev_data[0]
data = merge_previous_works_an(prev_data, data)
else:
log_warning('INVALID PREVIOUS WORKS', previous_works)
    # is this dosleg itself part of a newer dosleg's previous works?
next_legislature = data['assemblee_legislature'] + 1 if 'assemblee_legislature' in data else 9999
if parse_next_works and next_legislature < 15:
# TODO: parse 15th legislature from open data if it exists
resp = download_historic_dosleg(url_an.replace('/%d/' % data['assemblee_legislature'], '/%d/' % (data['assemblee_legislature'] + 1)))
if resp.status_code == 200:
recent_data = historic_doslegs_parse(
resp.text, resp.url,
logfile=logfile,
nth_dos_in_page=nth_dos_in_page, parse_previous_works=False)
if recent_data:
log_warning('FOUND MORE RECENT WORKS', resp.url)
recent_data = recent_data[nth_dos_in_page] if len(recent_data) > 1 else recent_data[0]
data = merge_previous_works_an(data, recent_data)
if another_dosleg_inside:
others = historic_doslegs_parse(another_dosleg_inside, url_an, logfile=logfile, nth_dos_in_page=nth_dos_in_page+1)
if others:
return [data] + others
    return [data]


def parse(url, logfile=sys.stderr, cached_opendata_an={}):
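    """Parse an AN dossier URL: try the open-data parser for /dyn/ URLs,
    then fall back to the historic HTML doslegs."""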
url = clean_url(url)
if '/dyn/' in url:
parsed = opendata_parse(url, logfile=logfile, cached_opendata_an=cached_opendata_an)
if parsed:
return [parsed]
print('WARNING: NOT FOUND IN OPEN-DATA', file=logfile)
resp = download_historic_dosleg(url)
if resp.status_code != 200:
print('WARNING: NOT FOUND IN HISTORIC DOSLEGS', file=logfile)
return []
    return historic_doslegs_parse(resp.text, resp.url, logfile=logfile)


"""
Unhandled cases (old dossiers):
- renvois en commission (texts sent back to committee): http://www.assemblee-nationale.fr/14/dossiers/interdiction_prescription_acquisitive_voies_rurales.asp
- missing Sénat ppl (proposition de loi): http://www.assemblee-nationale.fr/13/dossiers/comite_poids_mesures.asp
"""