In [269]:
from facebook_scraper import get_posts
import json
import pandas as pd
import twitter
import internetarchive as ia
import requests
import os
import datetime as dt
from git import Repo
import warnings

In [290]:
# Published Google Sheets CSV export that consolidar() saves locally as 'descartados.csv'.
datos_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSYiPWLCf9_HWB51cIryodmVlDg4RQS_AeccuJjkz9xVtDx7Y5tfk9RaF94BrVdO5vakBo5dUktZawh/pub?gid=0&single=true&output=csv"
# Second tab of the same spreadsheet: read by read_metadata() and saved as 'metadata.csv' by consolidar().
metadatos_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSYiPWLCf9_HWB51cIryodmVlDg4RQS_AeccuJjkz9xVtDx7Y5tfk9RaF94BrVdO5vakBo5dUktZawh/pub?gid=1875795980&single=true&output=csv"
# Local folder where the downloaded report images are written and later listed by filtrar_nuevos().
directory = 'reportes'
# CSV of cumulative confirmed cases per department, read by format_results().
casos_acumulados_url = 'https://raw.githubusercontent.com/sociedatos/covid19-bo-casos_por_departamento/master/confirmados_acumulados.csv'

In [271]:
def load_credenciales():
    "Read credenciales.json from the working directory and return its parsed JSON content."
    with open('credenciales.json', 'r') as archivo:
        contenido = archivo.read()
    return json.loads(contenido)
    
def auth_twitter():
    "Build a Twitter API client from the 'twitter' entry of the module-level `credenciales`."

    claves = credenciales['twitter']
    opciones = {
        nombre: claves[nombre]
        for nombre in ('consumer_key', 'consumer_secret', 'access_token_key', 'access_token_secret')
    }
    # tweet_mode='extended' is passed through unchanged; parse_twitter() reads `full_text`.
    return twitter.Api(tweet_mode='extended', **opciones)

def read_metadata():
    """Load the metadata sheet from `metadatos_url`, indexed by its parsed 'fecha' column.

    Rows that lack a value in any of the 'La Paz', 'Cochabamba' or 'Oruro'
    columns are dropped; every other row is returned unchanged.
    """

    hoja = pd.read_csv(metadatos_url, parse_dates=['fecha'], index_col=['fecha'])
    requeridas = ['La Paz', 'Cochabamba', 'Oruro']
    return hoja.dropna(subset=requeridas)

def filtrar_nuevos(meta):
    """Return only the rows of `meta` whose date has no downloaded file yet.

    The date of every file in `directory` is taken from the text after the
    last underscore of its name (before the first dot), parsed as YYYYMMDD.
    Files whose name does not end in a valid date (for example `.gitkeep`
    or `.DS_Store`) are ignored instead of aborting the whole run.

    Parameters:
        meta: DataFrame indexed by date.

    Returns:
        The subset of `meta` whose index is not among the dates already on disk.

    Raises:
        FileNotFoundError: if `directory` does not exist.
    """

    descargadas = set()
    for filename in os.listdir(directory):
        sello = filename.split('.')[0].split('_')[-1]
        try:
            descargadas.add(dt.datetime.strptime(sello, '%Y%m%d'))
        except ValueError:
            # Not a report file produced by create_identifier(); skip it.
            continue
    return meta[~meta.index.isin(list(descargadas))]

def create_identifier(dep, fecha):
    "Build the item identifier from the lower-cased, underscore-joined department name and the date as YYYYMMDD."
    slug = dep.lower().replace(' ', '_')
    sello = fecha.strftime('%Y%m%d')
    return 'reportesdepartamentalescovid19bolivia_{}_{}'.format(slug, sello)

def save_image(url, filename, timeout=30):
    """Download `url` and write the response body to `filename`.

    The download is completed and checked *before* the file is opened, so a
    failed request never leaves an empty or bogus file behind. This matters
    because filtrar_nuevos() treats any file named after a date as proof
    that the date was already archived.

    Parameters:
        url: address of the image.
        filename: destination path; its folder must already exist.
        timeout: seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeout.
    """
    respuesta = requests.get(url, timeout=timeout)
    respuesta.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(respuesta.content)

def parse_facebook(url, dep, fecha):
    """Fetch one Facebook post and store its first image locally.

    Returns a tuple (post_meta, identifier, filename) where post_meta holds
    the post's 'source', 'creator' and 'description', identifier comes from
    create_identifier(dep, fecha) and filename is the saved .jpeg path
    inside `directory`.
    """
    with warnings.catch_warnings():
        # Warnings raised while scraping are silenced for this call only.
        warnings.simplefilter("ignore")
        publicaciones = list(get_posts(post_urls=[url], cookies='cookies.json'))
        post = publicaciones[0]

    post_meta = dict(
        source=post['post_url'],
        creator=post['username'],
        description=post['text'],
    )
    identifier = create_identifier(dep, fecha)
    filename = '{}/{}.jpeg'.format(directory, identifier)
    primera_imagen = post['images'][0]
    save_image(primera_imagen, filename)
    return post_meta, identifier, filename

def parse_twitter(url, dep, fecha):
    """Fetch one tweet through the module-level client `tw` and store its first media file.

    The tweet id is taken from the path segment that follows 'status', after
    stripping any query string, fragment or trailing slash. Shared links such
    as '.../status/123?s=20' or '.../status/123/photo/1' therefore resolve to
    the same id as the bare link; taking the last '/' segment verbatim would
    query the wrong id for them.

    Parameters:
        url: link to the tweet.
        dep: department name, used for the identifier.
        fecha: report date, used for the identifier.

    Returns:
        (post_meta, identifier, filename): post_meta holds 'source' (the url
        as given), 'creator' and 'description'; filename is the saved path
        inside `directory`, with the extension of the media URL.

    Raises:
        KeyError / IndexError: if the tweet carries no media entity.
    """
    ruta = url.split('?', 1)[0].split('#', 1)[0].rstrip('/')
    partes = ruta.split('/')
    if 'status' in partes[:-1]:
        tweet_id = partes[partes.index('status') + 1]
    else:
        # Unknown URL layout: keep the historical behaviour (last segment).
        tweet_id = partes[-1]

    statuses = tw.GetStatuses([tweet_id])
    status = statuses[0]._json
    post_meta = {
        "source": url,
        "creator": status['user']['name'],
        "description": status['full_text']
    }
    identifier = create_identifier(dep, fecha)
    media = status['entities']['media'][0]['media_url']
    filename = '{}/{}.{}'.format(directory, identifier, media.split('.')[-1])
    save_image(media, filename)
    return post_meta, identifier, filename

def upload_archive(dep, fecha, post_meta, identifier, filename):
    """Upload one report image to the Internet Archive and print its details URL.

    Parameters:
        dep: department name, used in the item title.
        fecha: report date (anything with `.day` and `.strftime`).
        post_meta: dict with 'description', 'source' and 'creator'.
        identifier: Internet Archive item identifier.
        filename: local path of the image to upload.

    The access and secret keys are read from the module-level `credenciales`.
    """

    # '%-d' (day without zero padding) is a platform-specific strftime
    # extension that fails on Windows; reading `fecha.day` yields the same text
    # everywhere. The month name still follows the process locale, as before.
    fecha_legible = '{} de {}'.format(fecha.day, fecha.strftime('%B, %Y'))

    ia_meta = {'title': 'Reporte de Covid-19 en {}, Bolivia para el {}'.format(dep, fecha_legible),
               'description': post_meta['description'],
               'source': post_meta['source'],
               'creator': post_meta['creator'],
               'mediatype': 'image',
               'date': fecha.strftime('%Y-%m-%d')}

    ia.upload(identifier,
              filename,
              metadata=ia_meta,
              access_key = credenciales["ia"]["access"],
              secret_key = credenciales["ia"]["secret"])
    print('https://archive.org/details/{} : {}'.format(identifier, post_meta['source']))

def archivar(meta):
    """Download and archive every report linked in `meta`.

    `meta` is indexed by date and has one column per department; each cell is
    either a post URL (str) or empty. For every URL the matching parser is
    chosen by domain, the image is saved locally and then uploaded.

    Cells holding a string that is neither a Facebook nor a Twitter link are
    reported and skipped. Without that branch, the upload would run with the
    previous iteration's `post_meta`/`identifier`/`filename` (re-uploading the
    wrong department's file) or fail with NameError on the first cell.
    """
    for fecha, row in meta.iterrows():
        for dep, url in row.items():
            if not isinstance(url, str):
                # Empty cell (NaN): no report for this department and date.
                continue
            if 'facebook.com' in url:
                parser = parse_facebook
            elif 'twitter.com' in url:
                parser = parse_twitter
            else:
                print('Skipping {} ({}): unsupported source {}'.format(dep, fecha, url))
                continue
            post_meta, identifier, filename = parser(url, dep, fecha)
            upload_archive(dep, fecha, post_meta, identifier, filename)

In [272]:
# `credenciales` and `tw` are module-level names read by auth_twitter(),
# parse_twitter() and upload_archive(); they must exist before archivar() runs.
credenciales = load_credenciales()
tw = auth_twitter()
# Keep only the dates that have no file yet in the local reports folder, then archive them.
meta = filtrar_nuevos(read_metadata())
archivar(meta)

https://archive.org/details/reportesdepartamentalescovid19bolivia_la_paz_20220122 : https://facebook.com/story.php?story_fbid=4973623952658597&id=1140266705994360
https://archive.org/details/reportesdepartamentalescovid19bolivia_cochabamba_20220122 : https://facebook.com/story.php?story_fbid=4772145992880098&id=957131637714905
https://archive.org/details/reportesdepartamentalescovid19bolivia_oruro_20220122 : https://facebook.com/story.php?story_fbid=301406578681673&id=100064370004661
https://archive.org/details/reportesdepartamentalescovid19bolivia_potosi_20220122 : https://facebook.com/story.php?story_fbid=479858346934035&id=110681273851746
https://archive.org/details/reportesdepartamentalescovid19bolivia_tarija_20220122 : https://facebook.com/story.php?story_fbid=995640174397993&id=396941987601151
https://archive.org/details/reportesdepartamentalescovid19bolivia_chuquisaca_20220122 : https://facebook.com/story.php?story_fbid=769072907829464&id=108534767216618
https://archive.org/deta

In [291]:
def consolidar():
    """
    Download both published sheets to local CSV files.

    `metadatos_url` is saved as 'metadata.csv' and `datos_url` as
    'descartados.csv', with carriage returns removed.

    Raises:
        requests.HTTPError: if either sheet answers with a 4xx/5xx status, so
            an error page is never written over the local CSV.
        requests.RequestException: on connection problems or timeout.
    """

    def download_sheet(url, filename):
        "Fetch `url` and write its body, minus carriage returns, to `filename`."
        respuesta = requests.get(url, timeout=60)
        respuesta.raise_for_status()
        # Work on the raw bytes: nothing depends on the guessed response
        # encoding or on the platform's default file encoding and newline
        # translation, which could corrupt accents or re-insert '\r'.
        with open(filename, 'wb') as f:
            f.write(respuesta.content.replace(b'\r', b''))

    for url, filename in zip([metadatos_url, datos_url], ['metadata.csv', 'descartados.csv']):
        download_sheet(url, filename)

# Department columns shared by the three functions below. Previously this list
# only existed as a local of format_results(), so the helpers worked only when
# a `dep_cols` global happened to be left over in the kernel.
DEP_COLS = ['La Paz', 'Cochabamba', 'Santa Cruz', 'Oruro', 'Potosi', 'Tarija', 'Chuquisaca', 'Beni', 'Pando']


def _load_casos():
    "Read the cumulative confirmed cases CSV and rename 'Potosí' to 'Potosi' to match DEP_COLS."
    casos = pd.read_csv(casos_acumulados_url, parse_dates=[0], index_col=0)
    return casos.rename(columns={'Potosí': 'Potosi'})


def format_descartados(dep_cols=None, start='2022-01-21', filename='descartados.csv'):
    """Load the discarded-tests table and fill single-department gaps.

    From `start` onwards, when exactly one department value is missing in a
    row, it is set to the national total ("Bolivia") minus the sum of the
    departments that are present. Rows with zero or several gaps are left as
    they are.

    Parameters:
        dep_cols: department column names; defaults to DEP_COLS.
        start: first date (inclusive) on which gaps are filled.
        filename: CSV to read; needs 'Fecha', 'Bolivia' and the department columns.

    Returns:
        The full table indexed by 'Fecha', with the gaps filled in.
    """
    dep_cols = list(DEP_COLS if dep_cols is None else dep_cols)
    descartados = pd.read_csv(filename, parse_dates=['Fecha'], index_col=['Fecha'])

    # Iterate over a snapshot and write through `descartados.loc`: assigning to
    # the row yielded by iterrows() is not guaranteed to reach the frame.
    recientes = descartados.loc[start:].copy()
    for fecha, row in recientes.iterrows():
        faltantes = row[dep_cols].isna()
        if faltantes.sum() == 1:
            columna = faltantes[faltantes].index[0]
            descartados.loc[fecha, columna] = row["Bolivia"] - row[dep_cols].sum()

    return descartados


def format_pruebas(casos=None, descartados=None, dep_cols=None):
    """Compute daily tests per department from cumulative cases and discarded tests.

    Cumulative tests are cases plus (interpolated) discarded tests. The series
    is extended to one row per calendar day, interpolated linearly, and
    differenced to obtain daily values, which are truncated to int.

    Parameters:
        casos: cumulative confirmed cases by date; loaded from
            `casos_acumulados_url` when omitted.
        descartados: cumulative discarded tests by date; loaded with
            format_descartados() when omitted.
        dep_cols: department column names; defaults to DEP_COLS.

    Returns:
        DataFrame of daily tests, one row per calendar day after the first.
    """
    dep_cols = list(DEP_COLS if dep_cols is None else dep_cols)
    if casos is None:
        casos = _load_casos()
    if descartados is None:
        descartados = format_descartados(dep_cols)

    pruebas = (casos[dep_cols] + descartados[dep_cols].interpolate()).dropna()
    pruebas = pruebas[~pruebas.index.duplicated()]
    # Reindexing keeps the numeric dtype; concatenating with an all-empty
    # object-dtype frame made the later interpolate() depend on how pandas
    # resolves the mixed dtypes.
    calendario = pd.date_range(pruebas.index.min(), pruebas.index.max())
    pruebas = pruebas.reindex(pruebas.index.union(calendario))
    return pruebas.interpolate().diff().dropna().astype(int)


def format_results():
    """Build the three output tables.

    Returns:
        (descartados, pruebas, positividad): the gap-filled discarded tests,
        the daily tests, and the daily new cases divided by the daily tests.
    """
    descartados = format_descartados(DEP_COLS)
    casos = _load_casos()
    pruebas = format_pruebas(casos, descartados, DEP_COLS)
    positividad = (casos.diff().dropna() / pruebas).dropna()
    return descartados, pruebas, positividad

def save_results(descartados, pruebas, positividad):
    "Write the three result tables to CSV files in the working directory."
    salidas = (
        (descartados, 'descartados.csv', "%.0f"),
        (pruebas, 'pruebas.csv', "%.0f"),
        (positividad, 'positividad.csv', "%.3f"),
    )
    for tabla, destino, formato in salidas:
        tabla.to_csv(destino, float_format=formato)

In [292]:
# Refresh the local CSV copies of both sheets, rebuild the three tables and write them to disk.
# Note: consolidar() downloads the raw sheet to 'descartados.csv' and save_results()
# then overwrites that same file with the processed table.
consolidar()
descartados, pruebas, positividad = format_results()
save_results(descartados, pruebas, positividad)

In [294]:
# Display the `pruebas` table returned by format_results() as a visual sanity check.
pruebas

Unnamed: 0,La Paz,Cochabamba,Santa Cruz,Oruro,Potosi,Tarija,Chuquisaca,Beni,Pando
2020-05-08,128,52,157,8,14,13,1,81,3
2020-05-09,123,46,256,8,16,2,29,9,5
2020-05-10,71,17,347,20,29,0,27,7,5
2020-05-11,83,9,647,13,7,1,11,84,5
2020-05-12,103,117,260,18,6,22,0,69,0
...,...,...,...,...,...,...,...,...,...
2022-01-16,3563,1352,2971,1235,502,823,816,137,12
2022-01-17,2402,4133,8362,479,635,1787,2051,461,176
2022-01-18,3372,4434,8333,1439,739,2350,2369,1187,332
2022-01-19,4413,4549,8351,1803,711,2091,2052,794,331


In [268]:
def update():
    """
    Commit every pending change in the local repository and push it to origin.

    The commit message is the most recent 'fecha' found in metadata.csv,
    formatted as YYYY-MM-DD.
    """

    metadata = pd.read_csv('metadata.csv', parse_dates=['fecha'])
    ultima_fecha = metadata['fecha'].max()

    repo = Repo('.')
    # NOTE(review): add(all=True) stages every change in the working tree;
    # confirm that credenciales.json and cookies.json are git-ignored.
    repo.git.add(all=True)
    mensaje = ultima_fecha.strftime('%Y-%m-%d')
    repo.index.commit(mensaje)
    repo.remotes.origin.push()

In [274]:
# Commit and push the refreshed files; update() reads metadata.csv, so consolidar() must have run first.
update()