In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import twitter
import internetarchive as ia
import requests
import json
import os
import re
import datetime as dt
from git import Repo
import locale
locale.setlocale(locale.LC_TIME, 'es_US.UTF8');

In [2]:
datos_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vRZv851TVHTZd99eke7VVb3tchFjrp1pwmmK0ipQruVoAHovoDe8_VMgQtDZIPckn6_Aiu5Hux_ACzq/pub?gid=0&single=true&output=csv"
metadatos_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vRZv851TVHTZd99eke7VVb3tchFjrp1pwmmK0ipQruVoAHovoDe8_VMgQtDZIPckn6_Aiu5Hux_ACzq/pub?gid=1026774595&single=true&output=csv'
directory = 'reportes'

In [3]:
def archivar_reportes():
    
    def is_twitter(url):
        return len(re.findall('twitter.com\/', url)) > 0

    def is_unidoscontraelcovid(url):
        return len(re.findall('unidoscontraelcovid.gob.bo\/', url)) > 0

    def read_metadata():
        meta = pd.read_csv(metadatos_url, parse_dates=['fecha'])
        meta = meta[meta.fuente.notna()]
        return meta

    def filter_new_metadata(meta):
        archived = [dt.datetime.strptime(filename.split('.')[0].split('_')[1], '%Y%m%d') for filename in os.listdir(directory)]
        return meta[~meta.fecha.isin(archived)]

    def archive(ia_id, description, fuente, creator, fecha, filename, image_url):
        with open(filename, 'wb') as f:
            f.write(requests.get(image_url).content)

        ia_meta = {'title': 'Reporte de Vacunación de Covid-19 en Bolivia para el {}'.format(fecha.strftime('%-d de %B, %Y')),
                   'description': description,
                   'source': fuente,
                   'creator': creator,
                   'mediatype': 'image',
                   'collection': 'vacunacion-covid19-bolivia',
                   'date': fecha.strftime('%Y-%m-%d')}

        ia.upload(ia_id,
                  filename,
                  metadata=ia_meta,
                  access_key = credenciales["ia"]["access"],
                  secret_key = credenciales["ia"]["secret"],
                  retries=4
                 )

    def load_credenciales():
            with open('credenciales.json', 'r') as f:
                return json.load(f)

    def auth_twitter():
            "Autenticar en Twitter para realizar consultas al API"

            return twitter.Api(consumer_key = credenciales['twitter']['consumer_key'],
                               consumer_secret = credenciales['twitter']['consumer_secret'],
                               access_token_key = credenciales['twitter']['access_token_key'],
                               access_token_secret = credenciales['twitter']['access_token_secret'],
                               tweet_mode='extended')

    def make_ia_id(fecha):
        return 'vacunacioncovid19bolivia_{}'.format(fecha.strftime('%Y%m%d'))

    def make_filename(ia_id, image_url):
        return '{}/{}.{}'.format(directory, ia_id, image_url.split('.')[-1])

    def download_image(filename, image_url):
        with open(filename, 'wb') as f:
            f.write(requests.get(image_url).content)

    def get_twitter(url):
        status_id = int(url.split('/')[-1])
        status = tw.GetStatus(status_id)
        description = status.full_text
        fuente = url
        creator = status.user.name
        image_url = status._json['entities']['media'][0]['media_url_https']
        return description, fuente, creator, image_url

    def get_unidoscontraelcovid(url, fecha):
        html = BeautifulSoup(requests.get(url).text, 'html.parser')
        description = 'Reporte de Vacunación de Covid-19 en Bolivia para el {}'.format(fecha.strftime('%-d de %B, %Y')),
        fuente = url
        creator = 'Unidos Contra el Covid'
        image_url = html.select('.post img')[0]['src']
        return description, fuente, creator, image_url
    
    credenciales = load_credenciales()
    tw = auth_twitter()

    meta = filter_new_metadata(read_metadata())
    print('{} reportes que archivar'.format(len(meta)))

    for i, row in meta.iterrows():
        
        fuente = row['fuente']
        fecha = row['fecha']
        print('{} - {}'.format(fecha.strftime('%Y-%m-%d'), fuente))
        
        if is_twitter(row['fuente']):
            description, fuente, creator, image_url = get_twitter(fuente)
        
        elif is_unidoscontraelcovid(row['fuente']):
            description, fuente, creator, image_url = get_unidoscontraelcovid(fuente, fecha)
        
        ia_id = make_ia_id(fecha)
        filename = make_filename(ia_id, image_url)
        download_image(filename, image_url)
        
        archive(ia_id, description, fuente, creator, fecha, filename, image_url)

def consolidar():
    """
    Sincroniza sheets
    """
    
    def download_sheet(url, filename):
        datos = requests.get(url).text
        datos = datos.replace('\r','')
        with open(filename, 'w+') as f:
            f.write(datos)
            
    for url, filename in zip([metadatos_url, datos_url], ['metadata.csv', 'vaccinations.csv']):
        download_sheet(url, filename)
        
def update():
    """
    Actualiza el repositorio
    """
    
    last_update = pd.read_csv('metadata.csv', parse_dates=['fecha']).fecha.max()
    repository = Repo('.')
    repository.git.add(all=True)
    repository.index.commit(last_update.strftime('%Y-%m-%d'))
    repository.remotes.origin.push()

In [None]:
archivar_reportes()
consolidar()
update()

16 reportes que archivar
2022-06-23 - https://twitter.com/notiAsolysombra/status/1540162867072212995
2022-06-24 - https://twitter.com/notiAsolysombra/status/1540537491215761408
2022-06-25 - https://twitter.com/notiAsolysombra/status/1540907293729902592
2022-06-26 - https://twitter.com/notiAsolysombra/status/1541252705259327490
2022-06-27 - https://twitter.com/notiAsolysombra/status/1541616569641574402
2022-06-28 - https://twitter.com/notiAsolysombra/status/1541984509863907329
2022-06-29 - https://twitter.com/notiAsolysombra/status/1542348092162424832
2022-06-30 - https://www.boliviatv.bo/principal/nota.php?noticia=12ea44692783b6694f613c1639e29c2e
2022-07-02 - https://twitter.com/jornada7bolivia/status/1543455917475352582
2022-07-03 - https://twitter.com/ahora_elpueblo/status/1543770378165800960
2022-07-04 - https://twitter.com/notiAsolysombra/status/1544149001217400836
2022-07-05 - https://twitter.com/notiAsolysombra/status/1544519825858387968
2022-07-06 - https://twitter.com/notiAsoly