# Scraping

## Init

In [1]:
import os
import sys
# Put the directory two levels above the working directory first on the import
# path; presumably this is what lets the `core` package below resolve.
# NOTE(review): the path is relative, so it depends on where the notebook is
# launched from — confirm.
sys.path.insert(0, "../../")

from core import database, util
from core.data.raw import Scrapper, RawDataRepo
from core.data.raw.entities import RawData

# Initialise the project's database module before any cell touches it.
# NOTE(review): what init() sets up is not visible here — see core.database.
database.init()

## Scraping

In [2]:
# Create the scraper object that the scraping cells below call into.
scrapper = Scrapper()

### scrap_params

In [3]:
# Fetch the parameter object from the scraper; its `kabko` and `tanggal`
# attributes are read by later cells.
params = scrapper.scrap_params()
params

<core.data.raw.entities.Params at 0x8349b08>

In [4]:
# Peek at the first five `kabko` values. As the output shows, the list includes
# an empty string and non-regional entries such as '- (STATUS PENDING)'.
params.kabko[:5]

['',
 '- (STATUS PENDING)',
 'AWAK BUAH KAPAL',
 'KAB. BANGKALAN',
 'KAB. BANYUWANGI']

### scrap

In [5]:
# Scrape a single record for one kabko name and one date (passed here as a
# 'YYYY-MM-DD' string), then display the returned object.
result = scrapper.scrap('KOTA SURABAYA', '2020-06-01')
result

<core.data.raw.entities.RawData at 0x9090988>

In [6]:
# Total of the ODP category in the scraped record.
# NOTE(review): ODP is presumably "Orang Dalam Pemantauan" — confirm.
result.odp.total

3711

In [7]:
# Total of the PDP category in the scraped record.
# NOTE(review): PDP is presumably "Pasien Dalam Pengawasan" — confirm.
result.pdp.total

3057

In [8]:
# Total of the positif (positive) category in the scraped record.
result.positif.total

2633

### scrap_bulk

In [9]:
# Scrape every entry of `params.kabko` for the given list of dates (a single
# date here) and display the first five returned objects.
results = scrapper.scrap_bulk(params.kabko, ['2020-06-01'])
results[:5]

[<core.data.raw.entities.RawData at 0x90e7d48>,
 <core.data.raw.entities.RawData at 0x90e9dc8>,
 <core.data.raw.entities.RawData at 0x90e6c88>,
 <core.data.raw.entities.RawData at 0x90e75c8>,
 <core.data.raw.entities.RawData at 0x90e61c8>]

In [10]:
# Show total() for the first five results only. Slicing before the
# comprehension avoids calling total() on every scraped record just to
# discard all but five of the values.
[x.total() for x in results[:5]]

[568256, 9, 19, 21452, 9512]

## Storage

### Entities Database Helper Methods

In [11]:
# Serialise the scraped record into the row form that
# RawDataRepo.save_data() is given later in this notebook.
db_row = result.to_db_row()

In [12]:
# Convert the database row back into a mapping of RawData constructor keyword
# arguments (it is unpacked with ** when building `result2`).
keyword_row = RawData.from_db_row(db_row)

In [13]:
# Rebuild a RawData instance from the keyword arguments in `keyword_row`.
result2 = RawData(**keyword_row)

In [14]:
# Round-trip check: the rebuilt object must serialise to the same row as the
# original scraped record.
result2.to_db_row() == db_row

True

### Database

In [15]:
# Query the raw table directly to inspect what is stored; the connection and
# the cursor are both context-managed by the `with` statement.
with database.get_conn() as conn, conn.cursor() as cur:
    cur.execute("""
        SELECT * FROM main.raw_covid_data LIMIT 5
    """)
    
    # Print at most five rows (an empty list when the table has no rows).
    print(cur.fetchall())

[]


### fetch_kabko

In [16]:
# First five kabko names known to the repository. Unlike `params.kabko`, the
# output has no leading empty-string entry.
RawDataRepo.fetch_kabko()[:5]

['- (STATUS PENDING)',
 'AWAK BUAH KAPAL',
 'KAB. BANGKALAN',
 'KAB. BANYUWANGI',
 'KAB. BLITAR']

### fetch_kabko_dict

In [17]:
# Look up the empty-string key in the kabko mapping; per the output it maps
# to 'JAWA TIMUR'.
RawDataRepo.fetch_kabko_dict()[""]

'JAWA TIMUR'

### save_data

In [18]:
# Persist the single scraped record; save_data() is given a list of rows.
RawDataRepo.save_data([result.to_db_row()])

### fetch_data

In [19]:
# Count the stored records for KOTA SURABAYA (1 once the record has been saved).
len(RawDataRepo.fetch_data("KOTA SURABAYA"))

1

### get_latest_tanggal

Returns `None` if the table is empty.

In [22]:
# Most recent tanggal (date) stored in the raw table.
RawDataRepo.get_latest_tanggal()

## Filling the data

In [34]:
# Presumably keeps only the scrapeable dates that come after the newest date
# already stored, so the fill loop fetches just the missing days.
# NOTE(review): get_latest_tanggal() yields None on an empty table — confirm
# that util.filter_dates_after accepts None.
tanggal = util.filter_dates_after(params.tanggal, RawDataRepo.get_latest_tanggal())
tanggal

[]

In [35]:
# Backfill one date at a time: scrape every kabko for the date, then persist
# that batch before moving on to the next date.
for t in tanggal:
    data = scrapper.scrap_bulk(params.kabko, [t])
    RawDataRepo.save_data([d.to_db_row() for d in data])
    # An f-string formats `t` whatever its type; "Done: " + t would raise
    # TypeError (after the save had already happened) if `t` were not a str.
    print(f"Done: {t}")

## Trimming Early Zero Data

In [2]:
# Earliest stored date for one kabko, checked before trimming.
RawDataRepo.get_oldest_tanggal("AWAK BUAH KAPAL")

datetime.date(2020, 5, 17)

In [3]:
# Trim the early zero-valued data from the stored records.
# NOTE(review): the returned value (0 here) is presumably the number of rows
# removed — confirm against RawDataRepo.trim_early_zeros.
RawDataRepo.trim_early_zeros()

0