# Scraping

## Init

In [1]:
import os
import sys
sys.path.insert(0, "../../")

from core import database, util
from core.data.raw import Scrapper, RawDataRepo
from core.data.raw.entities import RawData

# Initialise the project's database helper before any cell below calls
# database.get_conn() (what init() sets up is not visible from this notebook).
database.init()

## Scraping

In [2]:
# One scraper instance shared by every scraping cell below; fill_data() also
# reads this notebook-level name.
scrapper = Scrapper()

### scrap_params

In [3]:
# scrap_params() returns a Params entity; later cells read its `kabko` and
# `tanggal` attributes.
params = scrapper.scrap_params()
params

<core.data.raw.entities.Params at 0x8570848>

In [4]:
# Peek at the first five kabko entries. The recorded output includes an empty
# string and a '- (STATUS PENDING)' entry alongside real region names.
params.kabko[:5]

['',
 '- (STATUS PENDING)',
 'AWAK BUAH KAPAL',
 'KAB. BANGKALAN',
 'KAB. BANYUWANGI']

### scrap

In [5]:
# Scrape one kabko for one date (passed as a 'YYYY-MM-DD' string); the result
# is a single RawData entity.
result = scrapper.scrap('KOTA SURABAYA', '2020-06-01')
result

<core.data.raw.entities.RawData at 0x941cc08>

In [6]:
# `total` of the record's `odp` group.
result.odp.total

3711

In [7]:
# `total` of the record's `pdp` group.
result.pdp.total

3057

In [8]:
# `total` of the record's `positif` group.
result.positif.total

2633

### scrap_bulk

In [9]:
# Scrape every entry of params.kabko for a one-element list of dates; the
# result is a list of RawData entities (first five shown).
results = scrapper.scrap_bulk(params.kabko, ['2020-06-01'])
results[:5]

[<core.data.raw.entities.RawData at 0x94d3588>,
 <core.data.raw.entities.RawData at 0x94d6288>,
 <core.data.raw.entities.RawData at 0x94cfac8>,
 <core.data.raw.entities.RawData at 0x94d3288>,
 <core.data.raw.entities.RawData at 0x94cfc08>]

In [10]:
# total() of every scraped record, first five shown.
# NOTE(review): the first recorded value (513175410) is orders of magnitude
# larger than the others -- verify total() for that entry before relying on it.
[x.total() for x in results][:5]

[513175410, 9, 19, 21452, 9512]

## Storage

### Entities Database Helper Methods

In [11]:
# Serialise the scraped entity into the row form that RawDataRepo.save_data()
# is given later in this notebook.
db_row = result.to_db_row()

In [12]:
# Convert the row back into a mapping that can be unpacked with ** into the
# RawData constructor.
keyword_row = RawData.from_db_row(db_row)

In [13]:
# Rebuild an entity from the keyword mapping.
result2 = RawData(**keyword_row)

In [14]:
# Round-trip check: the rebuilt entity should serialise to the same row.
result2.to_db_row() == db_row

True

### Database

In [15]:
# Print how many rows main.raw_covid_data currently holds.
count_sql = """
        SELECT COUNT(*) AS COUNT FROM main.raw_covid_data
    """
with database.get_conn() as conn:
    with conn.cursor() as cur:
        cur.execute(count_sql)
        first_row = cur.fetchone()
        count = first_row[0]  # COUNT(*) is the only selected column

        print(count)

0


### fetch_kabko

In [16]:
# Load the kabko list through the repository (first five shown). This `kabko`
# name is what the fill-data cell passes to fill_data() further down.
kabko = RawDataRepo.fetch_kabko()
kabko[:5]

['', 'AWAK BUAH KAPAL', 'KAB. BANGKALAN', 'KAB. BANYUWANGI', 'KAB. BLITAR']

### fetch_kabko_dict

In [17]:
# Look up the empty-string key of the kabko dictionary; the recorded output is
# 'JAWA TIMUR'.
RawDataRepo.fetch_kabko_dict()[""]

'JAWA TIMUR'

### save_data

In [18]:
# Persist the single scraped record; save_data() is given a list of DB rows.
RawDataRepo.save_data([result.to_db_row()])

### fetch_data

In [19]:
# Number of stored records for KOTA SURABAYA (1 in the recorded run, after the
# single save above).
len(RawDataRepo.fetch_data("KOTA SURABAYA"))

1

### get_latest_tanggal

Returns `None` if the table is empty.

In [20]:
# Most recent date stored in the table; fill_data() uses the same call to
# decide which dates still need scraping.
RawDataRepo.get_latest_tanggal()

datetime.date(2020, 6, 1)

### Cleanup

In [25]:
# Delete every row of main.raw_covid_data, but only when the table holds
# exactly one row (expected to be the demo row saved earlier in this notebook).
# With any other row count nothing is deleted. The commit runs either way.
count_sql = """
        SELECT COUNT(*) AS COUNT FROM main.raw_covid_data
    """
delete_sql = """
            DELETE FROM main.raw_covid_data
        """
with database.get_conn() as conn:
    with conn.cursor() as cur:
        cur.execute(count_sql)
        count = cur.fetchone()[0]
        if count == 1:
            cur.execute(delete_sql)
        conn.commit()

## Filling the data

In [32]:
def fill_data(kabko, tanggal):
    """Scrape and store every date in ``tanggal`` that still needs filling.

    The candidate dates are narrowed with ``util.filter_dates_after`` against
    ``RawDataRepo.get_latest_tanggal()``. Each remaining date is scraped for
    all of ``kabko`` with one ``scrap_bulk`` call, and its rows are saved
    before the next date is requested.

    Relies on the notebook-level ``scrapper`` instance.

    Args:
        kabko: Kabko names handed to ``scrapper.scrap_bulk``.
        tanggal: Candidate dates. The progress message prints the first and
            last remaining entries as the range, so they are presumably
            ordered oldest first. Entries may be strings or date objects.

    Returns:
        None. Progress is reported with ``print``.
    """
    pending = util.filter_dates_after(tanggal, RawDataRepo.get_latest_tanggal())
    n_days = len(pending)
    if n_days == 0:
        print("No new data")
        return
    print("Filling %d days worth of data, from %s to %s." % (n_days, pending[0], pending[-1]))
    for day in pending:
        records = scrapper.scrap_bulk(kabko, [day])
        RawDataRepo.save_data([record.to_db_row() for record in records])
        # %-formatting instead of "+" so non-string dates do not raise TypeError.
        print("Done: %s" % (day,))

In [33]:
import time
import traceback

MAX_ATTEMPTS = 5          # give up after this many connection failures
RETRY_DELAY_SECONDS = 5   # base pause, multiplied by the attempt number

# Retry fill_data() on connection failures, with a bounded number of attempts
# and a growing pause so a persistent outage neither loops forever nor hammers
# the source. fill_data() re-reads the latest stored date on every call, so a
# retry presumably skips the days that were already saved.
# NOTE(review): this is the built-in ConnectionError; a library-specific
# connection error that does not derive from it would not be caught -- confirm
# what the scraper actually raises.
for attempt in range(1, MAX_ATTEMPTS + 1):
    try:
        fill_data(kabko, params.tanggal)
        break
    except ConnectionError:
        traceback.print_exc()
        if attempt == MAX_ATTEMPTS:
            raise  # out of attempts: surface the failure instead of looping
        time.sleep(RETRY_DELAY_SECONDS * attempt)

Filling 17 days worth of data, from 2020-06-22 to 2020-07-08.
Done: 2020-06-22
Done: 2020-06-23
Done: 2020-06-24
Done: 2020-06-25
Done: 2020-06-26
Done: 2020-06-27
Done: 2020-06-28
Done: 2020-06-29
Done: 2020-06-30
Done: 2020-07-01
Done: 2020-07-02
Done: 2020-07-03
Done: 2020-07-04
Done: 2020-07-05
Done: 2020-07-06
Done: 2020-07-07
Done: 2020-07-08


## Trimming Early Zero Data

In [34]:
# Oldest stored date for the 'AWAK BUAH KAPAL' entry, checked before trimming.
RawDataRepo.get_oldest_tanggal("AWAK BUAH KAPAL")

datetime.date(2020, 5, 17)

In [35]:
# Run the repository's early-zero trimming. The recorded output is 126
# (presumably the number of rows affected -- confirm against the repository).
RawDataRepo.trim_early_zeros()

126