In [1]:
from facebook_scraper import get_posts
import json
import pandas as pd
import twitter
import internetarchive as ia
import requests
import os
import datetime as dt
from git import Repo
import warnings
from bs4 import BeautifulSoup
import locale
# Use a Spanish locale for time formatting so strftime('%B') yields Spanish month
# names in the titles/descriptions built further down. The trailing ';' only
# suppresses the cell's output in the notebook.
locale.setlocale(locale.LC_TIME, 'es_US.UTF8');

In [2]:
# Published Google Sheets tab (CSV export); consolidar() saves it as 'descartados.csv'.
datos_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSYiPWLCf9_HWB51cIryodmVlDg4RQS_AeccuJjkz9xVtDx7Y5tfk9RaF94BrVdO5vakBo5dUktZawh/pub?gid=0&single=true&output=csv"
# Published Google Sheets tab (CSV export) with one report URL per date/department;
# read by reportes_por_archivar() and saved by consolidar() as 'metadata.csv'.
metadatos_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSYiPWLCf9_HWB51cIryodmVlDg4RQS_AeccuJjkz9xVtDx7Y5tfk9RaF94BrVdO5vakBo5dUktZawh/pub?gid=1875795980&single=true&output=csv"
# Local folder where report images are downloaded; its listing is what decides
# which reports count as already archived.
directory = 'reportes'
# CSV of daily confirmed cases per department, read by format_results().
casos_diarios_url = 'https://raw.githubusercontent.com/sociedatos/covid19-bo-casos_por_departamento/master/confirmados_diarios.csv'

In [3]:
def load_credenciales():
    """Return the parsed contents of ``credenciales.json`` in the working directory."""
    with open('credenciales.json', 'r') as handle:
        contenido = handle.read()
    return json.loads(contenido)
    
def auth_twitter():
    """Authenticate against Twitter and return the API client used for queries.

    Reads the four OAuth values from the module-level ``credenciales['twitter']``
    mapping and requests extended tweets (``tweet_mode='extended'``).
    """
    claves = credenciales['twitter']
    return twitter.Api(
        consumer_key=claves['consumer_key'],
        consumer_secret=claves['consumer_secret'],
        access_token_key=claves['access_token_key'],
        access_token_secret=claves['access_token_secret'],
        tweet_mode='extended',
    )

def reportes_por_archivar():
    """List the reports in the metadata sheet that have no local file yet.

    The sheet at ``metadatos_url`` is read with ``fecha`` as index and one column
    per department; each non-empty cell is a report URL. A report counts as
    archived when ``directory`` already holds a file whose base name (minus the
    ``reportesdepartamentalescovid19bolivia_`` prefix) equals
    ``<department slug>_<YYYYMMDD>``.

    Returns:
        pandas.DataFrame with columns ``url``, ``dep`` and ``fecha`` (one row per
        report still to archive). The columns are present even when no report is
        pending.
    """
    sheet = pd.read_csv(metadatos_url, parse_dates=['fecha'], index_col=['fecha'])

    # A set gives O(1) membership checks against the files already on disk.
    archivados = {
        filename.split('.')[0].replace('reportesdepartamentalescovid19bolivia_', '')
        for filename in os.listdir(directory)
    }

    nuevos = []
    # stack() yields ((fecha, dep), url) pairs and skips empty cells.
    # Series.iteritems() was removed in pandas 2.0; items() is the supported spelling.
    for (fecha, dep), url in sheet.stack().items():
        filename = '{}_{}'.format(dep.lower().replace(' ', '_'), fecha.strftime('%Y%m%d'))
        if filename not in archivados:
            nuevos.append({'url': url, 'dep': dep, 'fecha': fecha})

    return pd.DataFrame(nuevos, columns=['url', 'dep', 'fecha'])

def create_identifier(dep, fecha):
    """Build the archive identifier for a department/date pair.

    The department name is lower-cased with spaces turned into underscores and
    the date is rendered as ``YYYYMMDD``.
    """
    slug = '_'.join(dep.lower().split(' '))
    stamp = fecha.strftime('%Y%m%d')
    return 'reportesdepartamentalescovid19bolivia_' + slug + '_' + stamp

def save_image(url, filename):
    """Download ``url`` and write the response body to ``filename``.

    The request is completed and validated before the file is opened, so a
    failed download never leaves an empty or error-page file behind. This
    matters because files present in the reports folder are treated as already
    archived.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection errors or timeout.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)

def parse_facebook(url, dep, fecha):
    """Fetch a Facebook post and download its first image, if it has one.

    Args:
        url: URL of the Facebook post.
        dep: department name, used to build the identifier.
        fecha: report date, used to build the identifier.

    Returns:
        Tuple ``(is_image, post_meta, identifier, filename)``. ``is_image`` is
        False when the post has no images or when the first attachment is not
        served with an image content type; in that case nothing is written to
        ``filename``.
    """
    with warnings.catch_warnings():
        # Silence any warnings raised while scraping the post.
        warnings.simplefilter("ignore")
        post = list(get_posts(post_urls=[url], cookies='cookies.json'))[0]

    post_meta = {
        "source": post['post_url'],
        "creator": post['username'],
        "description": post['text'],
    }
    identifier = create_identifier(dep, fecha)
    filename = '{}/{}.jpeg'.format(directory, identifier)

    # A post may carry no images at all (missing key, None or empty list).
    images = post.get('images') or []
    if not images:
        return False, post_meta, identifier, filename

    # The header can be absent; treat that as "not an image" instead of
    # failing with "argument of type 'NoneType' is not iterable".
    content_type = requests.head(images[0]).headers.get('content-type') or ''
    is_image = 'image' in content_type
    if is_image:
        save_image(images[0], filename)
    return is_image, post_meta, identifier, filename

def parse_twitter(url, dep, fecha):
    """Fetch a tweet and download its first media item.

    Args:
        url: URL of the tweet. Query strings (``?s=20``), fragments, trailing
            slashes and ``/photo/1`` suffixes are tolerated.
        dep: department name, used to build the identifier.
        fecha: report date, used to build the identifier.

    Returns:
        Tuple ``(True, post_meta, identifier, filename)``; the file extension is
        taken from the media URL.

    Raises:
        KeyError: if the status has no ``media`` entry under ``entities``
            (presumably a tweet without attached media).
    """
    import re

    # Prefer the numeric id that follows /status/; fall back to the last path
    # segment of the URL with query string, fragment and trailing slash removed.
    match = re.search(r'/status(?:es)?/(\d+)', url)
    if match:
        tweet_id = match.group(1)
    else:
        tweet_id = url.split('?', 1)[0].split('#', 1)[0].rstrip('/').split('/')[-1]

    statuses = tw.GetStatuses([tweet_id])
    status = statuses[0]._json
    post_meta = {
        "source": url,
        "creator": status['user']['name'],
        "description": status['full_text']
    }
    identifier = create_identifier(dep, fecha)
    media = status['entities']['media'][0]['media_url']
    filename = '{}/{}.{}'.format(directory, identifier, media.split('.')[-1])
    save_image(media, filename)
    return True, post_meta, identifier, filename

def upload_archive(dep, fecha, post_meta, identifier, filename):
    """Upload one report image to the Internet Archive and print its URL.

    Args:
        dep: department name, used in the item title.
        fecha: report date, used in the title and as the item's ``date``.
        post_meta: mapping with ``description``, ``source`` and ``creator``.
        identifier: Internet Archive item identifier.
        filename: path of the local file to upload.
    """
    # '%-d' (day without zero padding) is a platform-specific strftime extension.
    titulo = 'Reporte de Covid-19 en {}, Bolivia para el {}'.format(dep, fecha.strftime('%-d de %B, %Y'))
    ia_meta = dict(
        title=titulo,
        description=post_meta['description'],
        source=post_meta['source'],
        collection='covid19-bolivia-departamentos',
        creator=post_meta['creator'],
        mediatype='image',
        date=fecha.strftime('%Y-%m-%d'),
    )

    claves_ia = credenciales["ia"]
    ia.upload(
        identifier,
        filename,
        metadata=ia_meta,
        access_key=claves_ia["access"],
        secret_key=claves_ia["secret"],
        retries=4,
    )
    print('https://archive.org/details/{} : {}'.format(identifier, post_meta['source']))

def parse_unidoscontraelcovid(url, dep, fecha):
    """Download the first image of a post on the unidoscontraelcovid site.

    The page is fetched and parsed with BeautifulSoup; the first ``<img>`` under
    an element with class ``post`` is saved, keeping the extension of its
    ``src``.

    Returns:
        Tuple ``(True, post_meta, identifier, filename)``.
    """
    identifier = create_identifier(dep, fecha)
    descripcion = 'Reporte de Covid-19 en Bolivia para el {}'.format(fecha.strftime('%-d de %B, %Y'))
    post_meta = dict(source=url, creator='Unidos Contra el Covid', description=descripcion)

    pagina = requests.get(url).text
    html = BeautifulSoup(pagina, 'html.parser')
    primera_imagen = html.select('.post img')[0]
    media = primera_imagen['src']

    extension = media.rsplit('.', 1)[-1]
    filename = '{}/{}.{}'.format(directory, identifier, extension)
    save_image(media, filename)

    return True, post_meta, identifier, filename
    
def archivar(enlaces):
    """Download and archive every report listed in ``enlaces``.

    Args:
        enlaces: DataFrame with ``url``, ``dep`` and ``fecha`` columns.

    Each row is routed to the parser matching its URL's site. Rows from an
    unsupported site are reported and skipped, so they can never reuse the
    parse results left over from a previous row. Processing is best effort: an
    error on one row is printed and the loop moves on to the next.
    """
    for _, row in enlaces.iterrows():
        try:
            url, dep, fecha = row['url'], row['dep'], row['fecha']
            print('{} - {} - {}'.format(dep, fecha.strftime('%Y-%m-%d'), url))

            if 'facebook.com' in url:
                is_image, post_meta, identifier, filename = parse_facebook(url, dep, fecha)
            elif 'twitter.com' in url:
                is_image, post_meta, identifier, filename = parse_twitter(url, dep, fecha)
            elif 'unidoscontraelcovid.gob.bo' in url:
                is_image, post_meta, identifier, filename = parse_unidoscontraelcovid(url, dep, fecha)
            else:
                # No parser for this site: skip instead of falling through
                # with stale (or undefined) values.
                print('Fuente no soportada, se omite: {}'.format(url))
                continue

            if is_image:
                upload_archive(dep, fecha, post_meta, identifier, filename)
        except Exception as e:
            # Deliberate best effort: report the failure and keep going.
            print(e)

In [5]:
# Module-level state read by the helper functions defined earlier:
# `credenciales` (auth_twitter, upload_archive) and `tw` (parse_twitter).
credenciales = load_credenciales()
tw = auth_twitter()
# Find the reports that have no local file yet and archive each of them.
enlaces = reportes_por_archivar()
archivar(enlaces)

Beni - 2022-02-27 - https://www.facebook.com/SedesBeni2021/posts/1908896469318126
Content Not Found
Beni - 2022-03-31 - https://www.facebook.com/SedesBeni2021/posts/1931815503692889
Content Not Found
Chuquisaca - 2022-04-08 - https://www.facebook.com/SEDESCh/posts/82129532260722
500 Server Error: Internal Server Error for url: https://m.facebook.com/SEDESCh/posts/82129532260722?locale=en_US
Pando - 2022-04-08 - https://www.facebook.com/SaludDePando/posts/17139572156183733
Content Not Found
Bolivia - 2022-06-04 - https://twitter.com/ahora_elpueblo/status/1533285631111069696
https://archive.org/details/reportesdepartamentalescovid19bolivia_bolivia_20220604 : https://twitter.com/ahora_elpueblo/status/1533285631111069696
Bolivia - 2022-06-05 - https://twitter.com/noticiasfides/status/1533643593679835137
https://archive.org/details/reportesdepartamentalescovid19bolivia_bolivia_20220605 : https://twitter.com/noticiasfides/status/1533643593679835137
Tarija - 2022-06-08 - https://twitter.com/E

In [6]:
def consolidar():
    """
    Sync the published sheets to local CSV files.

    Downloads ``metadatos_url`` to 'metadata.csv' and ``datos_url`` to
    'descartados.csv', stripping carriage returns. A failed request raises
    before anything is written, so an HTTP error page never overwrites an
    existing local file.

    Raises:
        requests.HTTPError: if a sheet answers with a 4xx/5xx status.
        requests.RequestException: on connection errors or timeout.
    """

    def download_sheet(url, filename):
        """Fetch one published sheet and store it as UTF-8 text."""
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        datos = response.text.replace('\r', '')
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(datos)

    for url, filename in ((metadatos_url, 'metadata.csv'), (datos_url, 'descartados.csv')):
        download_sheet(url, filename)

def format_descartados(dep_cols):
    """Load 'descartados.csv' and clean the per-department columns.

    Args:
        dep_cols: list of department column names.

    Two clean-up steps are applied:

    1. From 2022-01-21 onwards, when exactly one department is missing in a
       row, it is filled with ``Bolivia - sum(other departments)`` provided
       that difference is positive.
    2. For every department, values that are lower than the previous row's
       value (negative difference) are replaced with NaN.

    Returns:
        pandas.DataFrame indexed by 'Fecha'.
    """
    start = '2022-01-21'
    descartados = pd.read_csv('descartados.csv', parse_dates=['Fecha'], index_col=['Fecha'])

    # Fill the missing department (usually Santa Cruz).
    # iterrows() hands out row copies, so the fill has to be written back
    # through descartados.loc; iterate over a snapshot to keep reads and writes
    # independent.
    for fecha, row in descartados.loc[start:].copy().iterrows():
        por_dep = row[dep_cols]
        faltantes = por_dep.index[por_dep.isna()]
        if len(faltantes) != 1:
            continue
        missing = row["Bolivia"] - por_dep.sum()
        if missing > 0:
            descartados.loc[fecha, faltantes[0]] = missing

    # Drop values whose difference with the previous row is negative.
    # Whole-column assignment avoids chained indexing (df[col].loc[...] = ...),
    # which may silently write to a temporary copy.
    for dep in dep_cols:
        descartados[dep] = descartados[dep].mask(descartados[dep].diff() < 0)

    return descartados

# def format_pruebas(casos, descartados, dep_cols):
    # pruebas = (casos[dep_cols] + descartados[dep_cols].interpolate()).dropna()
    # empty = pd.DataFrame(index=pd.date_range(pruebas.index.min(), pruebas.index.max()), columns=dep_cols)
    # pruebas = pd.concat([pruebas, empty])
    # pruebas = pruebas[~pruebas.index.duplicated()].sort_index()
    # pruebas = pruebas.astype(float).interpolate().diff().dropna().astype(int)
    # return pruebas

def format_results():
    """Compute tests and positivity per department on the dates both sources share.

    ``pruebas`` is the daily confirmed cases plus the row-to-row difference of
    ``descartados`` (presumably a cumulative series -- confirm against the
    sheet). ``positividad`` is daily cases divided by ``pruebas``.

    Returns:
        Tuple ``(descartados, pruebas, positividad)`` of DataFrames.
    """
    dep_cols = ['La Paz', 'Cochabamba', 'Santa Cruz', 'Oruro', 'Potosi', 'Tarija', 'Chuquisaca', 'Beni', 'Pando']
    descartados = format_descartados(dep_cols)

    # The cases file spells the department with an accent; align it with dep_cols.
    casos = pd.read_csv(casos_diarios_url, parse_dates=[0], index_col=0).rename(columns={'Potosí': 'Potosi'})

    common_index = casos.index.intersection(descartados.index)
    casos_dep = casos.loc[common_index, dep_cols]
    descartados_diarios = descartados.diff().loc[common_index, dep_cols]

    pruebas = casos_dep + descartados_diarios
    # NOTE: days where pruebas is 0 are not special-cased here.
    positividad = casos_dep / pruebas.loc[common_index, dep_cols]

    return descartados, pruebas, positividad

def save_results(descartados, pruebas, positividad):
    """Write the three result tables to CSV files in the working directory.

    'descartados.csv' and 'pruebas.csv' are written without decimals;
    'positividad.csv' keeps three decimals.
    """
    salidas = (
        (descartados, 'descartados.csv', "%.0f"),
        (pruebas, 'pruebas.csv', "%.0f"),
        (positividad, 'positividad.csv', "%.3f"),
    )
    for tabla, ruta, formato in salidas:
        tabla.to_csv(ruta, float_format=formato)

In [7]:
# Refresh the local CSV copies of the sheets, derive tests/positivity, and
# write the results (save_results overwrites the downloaded 'descartados.csv').
consolidar()
descartados, pruebas, positividad = format_results()
save_results(descartados, pruebas, positividad)

In [7]:
def update():
    """
    Update the repository.

    Stages every change in the working tree, commits using the most recent
    'fecha' found in 'metadata.csv' (formatted YYYY-MM-DD) as the message, and
    pushes to the 'origin' remote.
    """
    # NOTE(review): add(all=True) stages everything that is not git-ignored --
    # confirm that credenciales.json and cookies.json are ignored.
    metadata = pd.read_csv('metadata.csv', parse_dates=['fecha'])
    last_update = metadata['fecha'].max()

    repository = Repo('.')
    repository.git.add(all=True)
    commit_message = last_update.strftime('%Y-%m-%d')
    repository.index.commit(commit_message)
    repository.remotes.origin.push()

In [8]:
# Stage, commit and push the refreshed files.
update()