# Scraping

## Init

In [1]:
import os
import sys
#sys.path.insert(0, "../../")

from prediksicovidjatim import database, util
from prediksicovidjatim.data.raw import Scrapper, RawDataRepo
from prediksicovidjatim.data.raw.entities import RawData

# Call the project's database initialiser before any repository call below
# (presumably sets up the connection — confirm in prediksicovidjatim.database).
database.init()
# Placeholders: a later cell uses `x = x or ...` on these names, so they must
# exist even when the exploratory scraping cells are skipped.
scrapper = None
params = None
kabko = None

## Scraping

In [20]:
# Scrapper instance used by the scrap_params / scrap / scrap_bulk cells below.
scrapper = Scrapper()

### scrap_params

In [3]:
# Scrape the available parameters; `params.kabko` and `params.tanggal` are
# read later in this notebook.
params = scrapper.scrap_params()
params

<core.data.raw.entities.Params at 0xbac2188>

In [4]:
# Preview the first five kabko options
# (kabko presumably means kabupaten/kota, i.e. regency/city — confirm).
params.kabko[:5]

['',
 '- (STATUS PENDING)',
 'AWAK BUAH KAPAL',
 'KAB. BANGKALAN',
 'KAB. BANYUWANGI']

### scrap

In [5]:
# Scrape one record: a single kabko on a single date, with the date given as
# a 'YYYY-MM-DD' string.
result = scrapper.scrap('KOTA SURABAYA', '2020-06-01')
result

<core.data.raw.entities.RawData at 0xceaf1c8>

In [6]:
# `total` attribute of the ODP figures on the scraped record
# (ODP presumably "Orang Dalam Pemantauan", persons under monitoring — confirm).
result.odp.total

3711

In [7]:
# `total` attribute of the PDP figures on the scraped record
# (PDP presumably "Pasien Dalam Pengawasan", patients under supervision — confirm).
result.pdp.total

3057

In [8]:
# `total` attribute of the positif (confirmed-positive) figures on the scraped record.
result.positif.total

2633

### scrap_bulk

In [None]:
# Scrape every kabko option for a single date, then preview five results.
# Note that params.kabko, as displayed earlier, also contains '' and
# '- (STATUS PENDING)' entries, so those are requested too.
results = scrapper.scrap_bulk(params.kabko, ['2020-06-01'])
results[:5]

In [None]:
# Slice first so total() is only evaluated for the five results being displayed,
# instead of computing it for every result and discarding all but five.
[x.total() for x in results[:5]]

## Storage

### Entities Database Helper Methods

In [None]:
# Serialise the scraped record into its database-row form.
db_row = result.to_db_row()

In [None]:
# Convert the row back into keyword arguments for RawData
# (the next cell unpacks it with `**`).
keyword_row = RawData.from_db_row(db_row)

In [None]:
# Rebuild a RawData instance from the keyword arguments.
result2 = RawData(**keyword_row)

In [None]:
# Round-trip check: displays True when to_db_row() of the rebuilt object
# equals the original row.
result2.to_db_row() == db_row

### Database

In [None]:
# Count the rows currently stored in main.raw_covid_data.
with database.get_conn() as conn, conn.cursor() as cur:
    cur.execute("""
        SELECT COUNT(*) AS COUNT FROM main.raw_covid_data
    """)
    # The query yields a single row; its first column is the count.
    count = cur.fetchone()[0]
    
    print(count)

### fetch_kabko

In [4]:
# Load the kabko list through the repository and preview five entries.
# Unlike params.kabko shown earlier, the displayed list has no '' or
# '- (STATUS PENDING)' entries at its start.
kabko = RawDataRepo.fetch_kabko()
kabko[:5]

['KAB. BANGKALAN',
 'KAB. BANYUWANGI',
 'KAB. BLITAR',
 'KAB. BOJONEGORO',
 'KAB. BONDOWOSO']

### fetch_kabko_dict

In [None]:
# Look up the empty-string key in the kabko dictionary.
# NOTE(review): what the "" entry represents is not visible here — confirm.
RawDataRepo.fetch_kabko_dict()[""]

### save_data

# Persist the single scraped record; save_data takes a list of db rows.
RawDataRepo.save_data([result.to_db_row()])

### fetch_data

In [None]:
# Number of stored records returned for one kabko.
len(RawDataRepo.fetch_data("KOTA SURABAYA"))

### get_latest_tanggal

Returns `None` if the table is empty.

In [6]:
# Most recent stored date; the displayed value is a datetime.date.
RawDataRepo.get_latest_tanggal()

datetime.date(2020, 7, 8)

### Cleanup

# Clean up after the save_data example (presumably removing the single sample
# row it wrote — confirm before running against a populated table).
with database.get_conn() as conn, conn.cursor() as cur:
    cur.execute("""
        SELECT COUNT(*) AS COUNT FROM main.raw_covid_data
    """)
    count = cur.fetchone()[0]
    # Guard: the DELETE below has no WHERE clause, so it only runs when the
    # table holds exactly one row.
    if count == 1:
        cur.execute("""
            DELETE FROM main.raw_covid_data
        """)
    # Commit is unconditional; it runs even when nothing was deleted.
    conn.commit()

## Filling the data

In [2]:
def fill_data(kabko, tanggal, max_process_count=4):
    """Scrape and store every date that is newer than the latest stored date.

    Relies on the notebook-level ``scrapper``, ``util`` and ``RawDataRepo``
    names being defined when the function is called.

    Args:
        kabko: Kabko names forwarded to ``scrapper.scrap_bulk``.
        tanggal: Candidate dates; narrowed with ``util.filter_dates_after``
            against ``RawDataRepo.get_latest_tanggal()``.
        max_process_count: Forwarded to ``scrapper.scrap_bulk`` as its third
            positional argument.

    Returns:
        None. Progress is reported with ``print``.
    """
    pending = util.filter_dates_after(tanggal, RawDataRepo.get_latest_tanggal())
    day_count = len(pending)
    if day_count == 0:
        print("No new data")
        return
    print("Filling %d days worth of data, from %s to %s." % (day_count, pending[0], pending[-1]))
    # One date per scrap_bulk call, saved immediately, so every finished day is
    # persisted before the next one is requested.
    for day in pending:
        rows = scrapper.scrap_bulk(kabko, [day], max_process_count)
        RawDataRepo.save_data([row.to_db_row() for row in rows])
        # %-formatting instead of "+" so a non-str date value cannot raise TypeError.
        print("Done: %s" % (day,))

In [3]:
# Reuse whatever earlier cells already produced; build only what is still
# unset (or otherwise falsy) so this section can run on its own.
if not scrapper:
    scrapper = Scrapper()
if not params:
    params = scrapper.scrap_params()
if not kabko:
    kabko = RawDataRepo.fetch_kabko()

In [4]:
import time
import traceback

# Retry the fill when the connection drops, but with a bounded number of
# attempts and a pause between them, so a persistent outage neither hammers
# the site nor leaves this cell looping forever.
MAX_FILL_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 10

for attempt in range(1, MAX_FILL_ATTEMPTS + 1):
    try:
        fill_data(kabko, params.tanggal)
        break
    # NOTE(review): this is the built-in ConnectionError. If the scrapper uses
    # `requests`, its ConnectionError is a different class and would not be
    # caught here — confirm which one is raised.
    except ConnectionError:
        traceback.print_exc()
        if attempt == MAX_FILL_ATTEMPTS:
            # Out of attempts: surface the failure instead of retrying silently.
            raise
        time.sleep(RETRY_DELAY_SECONDS)

Filling 1 days worth of data, from 2020-07-26 to 2020-07-26.
Done: 2020-07-26


## Trimming Early Zero Data

# NOTE(review): going by the heading, this presumably removes the leading
# all-zero records; the behaviour lives in RawDataRepo — confirm before running.
RawDataRepo.trim_early_zeros()

## Hospital Capacity

The government has added more data to the site: detailed hospital care data. We don't actually need the details, just the total capacity for each kabko. The site only shows the latest figures, so old data can't be retrieved; we should therefore scrape every day so that no capacity change is missed.

In [5]:
from prediksicovidjatim.data.kapasitas_rs import KapasitasRSScrapper
# Use a dedicated name: rebinding the global `scrapper` here would make
# fill_data() (which reads `scrapper`) and the `scrapper = scrapper or Scrapper()`
# cell pick up the wrong object when re-run.
kapasitas_rs_scrapper = KapasitasRSScrapper()
data = kapasitas_rs_scrapper.scrap()
len(data)

39

In [6]:
from prediksicovidjatim.data.kapasitas_rs import KapasitasRSRepo
# Persist the capacity records scraped in the previous cell.
KapasitasRSRepo.save(data)

In [7]:
# NOTE(review): what fix_kapasitas corrects is defined in KapasitasRSRepo and
# is not visible in this notebook — confirm.
KapasitasRSRepo.fix_kapasitas()