# Preparation

## Import libraries

In [1]:
import requests
import re
import pandas as pd
import time
import pytz
from datetime import datetime
import urllib.request, json, os, os.path, csv

## Define function

In [2]:
def zonasi(row):
    """Return the zone colour label for one record.

    The case counters are checked in priority order and the first one
    that is greater than zero (after ``int()`` conversion) decides the label:

    * ``total_positif`` > 0 -> ``'Merah'``
    * ``total_pdp`` > 0     -> ``'Oranye'``
    * ``total_odp`` > 0     -> ``'Hijau'``
    * otherwise             -> ``'Biru'``

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row[key]`` lookup (e.g. a dict or a pandas
        Series) whose counter values can be converted with ``int()``.

    Returns
    -------
    str
        One of ``'Merah'``, ``'Oranye'``, ``'Hijau'`` or ``'Biru'``.
    """
    # Ordered from highest to lowest priority; a counter is only read
    # when every counter before it was not positive.
    zone_by_counter = (
        ('total_positif', 'Merah'),
        ('total_pdp', 'Oranye'),
        ('total_odp', 'Hijau'),
    )
    for counter, zone in zone_by_counter:
        if int(row[counter]) > 0:
            return zone
    return 'Biru'

# Web Scraping

In [3]:
# Scrape the latest per-kecamatan COVID-19 counters for Hulu Sungai Selatan,
# assemble them into a DataFrame with the shared column layout, label each
# row with its zone colour and export the result to an Excel file.

# All timestamps in this cell are taken in the Asia/Jakarta timezone.
tz = pytz.timezone('Asia/Jakarta')

# One timestamp for the whole run, so scrape_date, waktu and the output
# file name can never disagree (e.g. when the run crosses midnight).
now = datetime.now(tz)
default_date_update = now.strftime("%d-%b-%Y")

format_header = ['scrape_date', 'date_update', 'waktu', 'provinsi', 'kabkot', 'kecamatan',
                 'kelurahan', 'alamat', 'total_odp', 'total_pdp', 'total_positif', 'total_otg',
                 'odr_total', 'total_pp', 'total_ppdt', 'source_link', 'zona', 'Ket']

API_URL = 'https://corona.hulusungaiselatankab.go.id/api/v1/informasi-kecamatan-terbaru/{0}'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the notebook

hulu_sungai_selatan_dataset = []

# Kecamatan name -> id used as the last path segment of the API URL.
kecamatan_id = {'Angkinang':1, 'Daha Barat':2, 'Daha Selatan':3, 'Daha Utara':4, 'Kalumpang':5,
                'Kandangan':6, 'Loksado':7, 'Padang Batung':8, 'Simpur':9, 'Sungai Raya':10, 'Telaga Langsat':11}

for kecamatan, api_id in kecamatan_id.items():
    url = API_URL.format(api_id)

    # Keep the connection open only for as long as it takes to read the body.
    with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT) as response:
        raw_data = json.loads(response.read().decode())

    counts = raw_data['data']
    last_update = datetime.strptime(counts['last_update'], '%Y-%m-%d %H:%M:%S')
    new_corpus = {'scrape_date':now.strftime("%d-%b-%Y"),
                  'date_update':last_update.strftime('%d-%b-%Y'),
                  'waktu':now.strftime("%H:%M"),
                  'provinsi':'Kalimantan Selatan',
                  'kabkot':'Hulu Sungai Selatan',
                  'kecamatan':kecamatan,
                  # Each total is the sum of the status-specific counters returned by the API.
                  'total_odp':int(counts['odp_proses'])+int(counts['odp_selesai']),
                  'total_pdp':int(counts['pdp_dirawat'])+int(counts['pdp_sembuh']),
                  'total_positif':int(counts['positif_dirawat'])+int(counts['positif_sembuh'])+int(counts['positif_meninggal']),
                  'source_link':url}

    if new_corpus not in hulu_sungai_selatan_dataset:
        hulu_sungai_selatan_dataset.append(new_corpus)
    else:
        print('Identical corpus already exist in dataset.')

print('Length of dataset:', len(hulu_sungai_selatan_dataset)) #11

# Columns the API does not provide: numeric ones default to 0, text ones to ''.
num_nan = ['total_odp', 'total_pdp', 'total_positif', 'total_otg',
           'odr_total', 'total_pp', 'total_ppdt']
str_nan = ['kecamatan', 'kelurahan', 'alamat', 'Ket']
column_defaults = {**dict.fromkeys(num_nan, 0), **dict.fromkeys(str_nan, '')}

# Build the frame in one call: DataFrame.append was removed in pandas 2.0 and
# growing a frame row by row is quadratic. Scraped values override the defaults.
df = pd.DataFrame(
    [{**column_defaults, **record} for record in hulu_sungai_selatan_dataset],
    columns=format_header,
)

df['zona'] = df.apply(zonasi, axis=1)

df.to_excel('{0}_Kecamatan_Hulu Sungai Selatan.xlsx'.format(now.strftime("%Y%m%d")), index=False)

print('Shape of df:', df.shape)
df.head()

Length of dataset: 11
Shape of df: (11, 18)


Unnamed: 0,scrape_date,date_update,waktu,provinsi,kabkot,kecamatan,kelurahan,alamat,total_odp,total_pdp,total_positif,total_otg,odr_total,total_pp,total_ppdt,source_link,zona,Ket
0,27-Jul-2020,27-Jul-2020,18:38,Kalimantan Selatan,Hulu Sungai Selatan,Angkinang,,,30,0,22,0,0,0,0,https://corona.hulusungaiselatankab.go.id/api/...,Merah,
1,27-Jul-2020,27-Jul-2020,18:38,Kalimantan Selatan,Hulu Sungai Selatan,Daha Barat,,,5,0,4,0,0,0,0,https://corona.hulusungaiselatankab.go.id/api/...,Merah,
2,27-Jul-2020,27-Jul-2020,18:38,Kalimantan Selatan,Hulu Sungai Selatan,Daha Selatan,,,79,1,11,0,0,0,0,https://corona.hulusungaiselatankab.go.id/api/...,Merah,
3,27-Jul-2020,27-Jul-2020,18:38,Kalimantan Selatan,Hulu Sungai Selatan,Daha Utara,,,58,0,52,0,0,0,0,https://corona.hulusungaiselatankab.go.id/api/...,Merah,
4,27-Jul-2020,27-Jul-2020,18:38,Kalimantan Selatan,Hulu Sungai Selatan,Kalumpang,,,14,0,7,0,0,0,0,https://corona.hulusungaiselatankab.go.id/api/...,Merah,
