In [1]:
import requests
from datetime import datetime
import pandas as pd
import json
import tweepy
import os

## On récupère la date du jour

In [2]:
# Reference instant for this run, and the same instant one day earlier.
one_day = pd.Timedelta(days=1)
today = datetime.now()
yesterday = today - one_day

## Quels sont les fichiers publiés en open data aujourd'hui ?

In [3]:
def retrieve_files_date(date):
    """Load the list of open-data files published on a given day.

    Reads ``publication_<YYYY-MM-DD>.csv`` from the Assemblée nationale
    open-data endpoint as a semicolon-separated table whose column names are
    supplied explicitly.

    Args:
        date: Object exposing ``strftime`` (e.g. a ``datetime``) that
            identifies the publication day.

    Returns:
        A DataFrame with the columns ``date`` and ``url``. It is empty when
        the listing cannot be fetched (HTTP error, network failure) or when
        the file has no content.
    """
    formated_date = date.strftime('%Y-%m-%d')
    listing_url = (
        'https://www.assemblee-nationale.fr/dyn/opendata/list-publication/'
        f'publication_{formated_date}.csv'
    )
    try:
        return pd.read_csv(listing_url, sep=';', names=['date', 'url'])
    except (OSError, pd.errors.EmptyDataError):
        # urllib's HTTPError/URLError derive from OSError, so a missing
        # listing or a network failure lands here. Anything else
        # (KeyboardInterrupt, programming errors) is deliberately not
        # swallowed.
        return pd.DataFrame(columns=['date', 'url'])

# Gather the listings of both days into one frame with a fresh index.
daily_listings = [retrieve_files_date(day) for day in (yesterday, today)]
files = pd.concat(daily_listings, ignore_index=True)

## On isole tous les rapports d'information

- L'id du rapport permet d'accéder au fichier de description json 
- Si l'`id` est différent du `full_id`, cela signifie que le rapport est publié en plusieurs parties (tomes). Il faudra alors retrouver la correspondance dans le fichier de description JSON.

In [4]:
# Keep only the PDF files of information reports ("RIN…" in the URL).
# na=False treats a missing URL as "not a report" instead of yielding a NaN
# entry in the mask, which .loc would reject.
is_ri_pdf = files['url'].str.contains(r'RIN.+\.pdf$', na=False)

ris = (files
    .loc[is_ri_pdf]
    .assign(
        # Everything from "RIN" up to the ".pdf" extension.
        # expand=False returns a Series for this single capture group.
        full_id = lambda x: x['url'].str.extract(r'(RIN.+)\.pdf', expand=False),
    )
)

# Split full_id at the first "-": the part before it is the report id, the
# remainder (possibly empty) is the tome.
# https://www.codegrepper.com/code-examples/javascript/regex+match+anything+except+character
ris[['id', 'tome']] = ris['full_id'].str.extract(r'(RIN[^-]+)-?(.*)', expand=True)

ris

Unnamed: 0,date,url,full_id,id,tome


In [5]:
# Download the JSON description of every distinct report id.
# Ids whose description cannot be fetched are reported and left out of the dict.
descriptions = {}
for report_id in ris['id'].unique():  # avoid shadowing the builtin `id`
    url = f'https://www.assemblee-nationale.fr/dyn/opendata/{report_id}.json'
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        descriptions[report_id] = response.json()
    else:
        print(f'{report_id} not found')

Pour le moment, on se contente d'ajouter au DataFrame la description de base de chaque rapport (son titre principal court).

In [6]:
def get_desc(id):
    """Return the short main title of a report, or '' when it is unknown.

    Args:
        id: Report identifier used as key in the module-level
            ``descriptions`` dict.

    Returns:
        The value stored under ``titres -> titrePrincipalCourt`` for that
        report. An empty string is returned when no description was stored
        for ``id`` or when the title fields are absent, so that one missing
        description does not abort the whole run with a KeyError.
    """
    titres = descriptions.get(id, {}).get('titres') or {}
    return titres.get('titrePrincipalCourt', '')

# Look up the description of each report from its id and store it in a new
# `description` column.
short_titles = ris['id'].apply(get_desc)
ris = ris.assign(description=short_titles)

ris

Unnamed: 0,date,url,full_id,id,tome,description


Quels sont les nouveaux RI ?   

In [7]:
# Reports recorded by earlier runs are stored in ris.csv.
try:
    old_ris = pd.read_csv('ris.csv', sep=';')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No history file yet (or an empty one): nothing has been recorded so far.
    # Other failures are raised on purpose: treating an unreadable history as
    # empty would make every report look new and let the later save
    # overwrite ris.csv without the old entries.
    old_ris = pd.DataFrame({'full_id': []})

# A report is new when its full_id is absent from the history.
new_ris = ris.loc[~ris['full_id'].isin(old_ris['full_id'])]
new_ris

Unnamed: 0,date,url,full_id,id,tome,description


## On sauvegarde les nouveaux rapports

In [8]:
# Rewrite ris.csv with the previously known reports followed by the new ones.
all_ris = pd.concat([old_ris, new_ris], ignore_index=True)
all_ris.to_csv('ris.csv', sep=';', index=False)

## On tweete les nouveaux rapports

In [9]:
# Twitter credentials: read the local JSON file when it is available,
# otherwise fall back to environment variables (GitHub Actions secrets).
try:
    # `with` closes the file handle deterministically.
    with open('../twitter-credentials.json') as credentials_file:
        credentials = json.load(credentials_file)
except OSError:
    # gh actions secrets
    # Only a missing or unreadable file triggers the fallback; a malformed
    # JSON file is surfaced instead of being silently ignored.
    credentials = {key: os.environ[key] for key in ["TWITTER_API_KEY", "TWITTER_API_SECRET", "TWITTER_ACCESS_KEY", "TWITTER_ACCESS_SECRET"]}

In [10]:
# Build the tweepy auth handler from the API key pair, then attach the
# access token pair and create the API client.
api_key, api_secret = credentials['TWITTER_API_KEY'], credentials['TWITTER_API_SECRET']
auth = tweepy.OAuthHandler(api_key, api_secret)
access_key, access_secret = credentials['TWITTER_ACCESS_KEY'], credentials['TWITTER_ACCESS_SECRET']
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

# Check the credentials up front; a failure is only printed, not raised.
try:
    api.verify_credentials()
    print("Authentication OK")
except Exception as auth_error:
    print(auth_error)
    print("Error during authentication")

Authentication OK


In [11]:
# Post one status per new report: link first, then the description.
for _, row in new_ris.iterrows():
    message = f'ℹ️ • Nouveau rapport d\'information {row["url"]} {row["description"]}'
    # Texts longer than 280 characters are cut to 277 and suffixed with "...".
    message = message if len(message) <= 280 else message[:277] + '...'
    api.update_status(message)