# WEB SCRAPING

- Web scraping adalah proses ekstrasi data dari sebuah website.
- Web scraping dilakukan dengan menggunakan web scraper, bot, web spider, atau web crawler. 
- Web scraper sendiri adalah program yang masuk ke halaman website, download kontennya, mengekstrak data dari konten, dan menyimpan data ke satu file atau database.

In [1]:
from bs4 import BeautifulSoup # untuk baca html
import requests # untuk ambil data dari web
import pandas as pd
from IPython.display import display 

# Scraping: World Population Dataset

from: https://www.worldometers.info/world-population/population-by-country/

## Akses data ke web yg ingin diambil datanya

In [2]:
# mengambil data dari web

world_web = requests.get('https://www.worldometers.info/world-population/population-by-country/')
world_web

<Response [200]>

In [3]:
# parsing data

data_web = BeautifulSoup(world_web.content, 'html.parser')
# data_web

## Mengambil data pada tag tertentu

In [4]:
# mencari data judul (yg di browser tab) di web disertai tag html
data_web.title

<title>Population by Country (2020) - Worldometer</title>

In [5]:
# mencari data judul di h1 disertai tag html
data_web.h1

<h1>Countries in the world by population (2020)</h1>

In [6]:
# mencari data judul di web dan outputnya berupa string
data_web.title.string

'Population by Country (2020) - Worldometer'

In [7]:
data_web.title.text

'Population by Country (2020) - Worldometer'

In [8]:
# tag tr adaah perintah untuk membuat satu baris tabel di html
# tag ini tidak terlalu akurat karena di dalamnya masih ada tag yg lebih dekat dengan data yg kita cari

tr = data_web.find_all('tr')

In [9]:
# <p> adalah perintah untuk membuat paragraf html

p = data_web.find_all('p')
p[0].text

'This list includes both countries and dependent territories. Data based on the latest United Nations Population Division estimates. Click on the name of the country or dependency for current estimates (live population clock), historical data, and projected figures.  See also: World Population  '

In [10]:
# <th> adalah perintah untuk membuat baris pertama (header table) di html

th = data_web.find_all('th')
th[7].text

'Migrants (net)'

In [11]:
# <td> adalah perintah untuk membuat baris kedua dst (data table) di html

td = data_web.find_all('td')
td[13].text

'India'

## Mengolah Data

In [12]:
# Menyimpan header table ke variable th
# Menyimpan datatabel ke varible td 
th = data_web.find_all('th')
td = data_web.find_all('td')

# ambil textnya saja
header = [i.text for i in th] 
isi_tabel = [i.text for i in td] 

# header
# isi_tabel

In [13]:
# cari ada berapa row table
jumlah_row = len(isi_tabel)/len(header)
jumlah_row

235.0

In [14]:
world_population = []
index_awal = 0
for i in range(int(jumlah_row)):
    index_akhir = (i+1)*12          # 12 adalah len(header) 
    world_population.append(isi_tabel[index_awal:index_akhir])
    index_awal = index_akhir

world_population[0]


['1',
 'China',
 '1,439,323,776',
 '0.39 %',
 '5,540,090',
 '153',
 '9,388,211',
 '-348,399',
 '1.7',
 '38',
 '61 %',
 '18.47 %']

In [15]:
# Ubah jadi data frame
df = pd.DataFrame(world_population, columns=header)
df.head()

Unnamed: 0,#,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,1,China,1439323776,0.39 %,5540090,153,9388211,-348399,1.7,38,61 %,18.47 %
1,2,India,1380004385,0.99 %,13586631,464,2973190,-532687,2.2,28,35 %,17.70 %
2,3,United States,331002651,0.59 %,1937734,36,9147420,954806,1.8,38,83 %,4.25 %
3,4,Indonesia,273523615,1.07 %,2898047,151,1811570,-98955,2.3,30,56 %,3.51 %
4,5,Pakistan,220892340,2.00 %,4327022,287,770880,-233379,3.6,23,35 %,2.83 %


In [16]:
# Ubah nama kolom
df.rename(columns={
    'Yearly Change': 'Yearly Change (%)'
}, inplace=True)

In [17]:
#  menghilangkan unsur '%' di kolom Yearly Change

gudang = []
for i in range(len(df['Yearly Change (%)'])):
    j = df['Yearly Change (%)'][i].split() # split jadi (0.39, %)
    gudang.append(float(j[0])) # indexing angkanya

# gudang
df['Yearly Change (%)'] = gudang


In [18]:
df.head()

Unnamed: 0,#,Country (or dependency),Population (2020),Yearly Change (%),Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,1,China,1439323776,0.39,5540090,153,9388211,-348399,1.7,38,61 %,18.47 %
1,2,India,1380004385,0.99,13586631,464,2973190,-532687,2.2,28,35 %,17.70 %
2,3,United States,331002651,0.59,1937734,36,9147420,954806,1.8,38,83 %,4.25 %
3,4,Indonesia,273523615,1.07,2898047,151,1811570,-98955,2.3,30,56 %,3.51 %
4,5,Pakistan,220892340,2.0,4327022,287,770880,-233379,3.6,23,35 %,2.83 %


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   #                        235 non-null    object 
 1   Country (or dependency)  235 non-null    object 
 2   Population (2020)        235 non-null    object 
 3   Yearly Change (%)        235 non-null    float64
 4   Net Change               235 non-null    object 
 5   Density (P/Km²)          235 non-null    object 
 6   Land Area (Km²)          235 non-null    object 
 7   Migrants (net)           235 non-null    object 
 8   Fert. Rate               235 non-null    object 
 9   Med. Age                 235 non-null    object 
 10  Urban Pop %              235 non-null    object 
 11  World Share              235 non-null    object 
dtypes: float64(1), object(11)
memory usage: 22.2+ KB


In [20]:
# df['Density'].astype('int64').dtypes

# Take Class Excercise: Indonesian Population in 1950-2020

## ``Indonesia Population in 10 years period``


In [21]:
# ambil data dari web
indo_pop = requests.get('https://www.worldometers.info/world-population/indonesia-population/')
indo_pop

<Response [200]>

In [22]:
# parsing data 

data_indo = BeautifulSoup(indo_pop.content, 'html.parser')
# data_indo

In [23]:
# ambil header table
th_indo = data_indo.find_all('th')

# ambil isi table 
td_indo = data_indo.find_all('td')

In [24]:
header_indo = [i.text for i in th_indo]
isi_indo = [i.text for i in td_indo]


In [25]:
# ambil header untuk tabel pertama dan kedua
header = header_indo[0:13]
header

['Year',
 'Population',
 'Yearly %  Change',
 'Yearly Change',
 'Migrants (net)',
 'Median Age',
 'Fertility Rate',
 'Density (P/Km²)',
 'Urban Pop %',
 'Urban Population',
 "Country's Share of World Pop",
 'World Population',
 'IndonesiaGlobal Rank']

In [26]:
# ambil isi untuk tabel pertama dan kedua
isi = isi_indo[2:-210]
# isi

In [27]:
len(isi)/len(header)

25.0

In [28]:
indo_population = []

index_awal = 0
for i in range(25):                 # 25 adalah jumlah baris
    index_akhir = (i+1)*13          # 13 adalah len(header) 
    indo_population.append(isi[index_awal:index_akhir])
    index_awal = index_akhir

indo_population[0]

['2020',
 '273,523,615',
 '1.07 %',
 '2,898,047',
 '-98,955',
 '29.7',
 '2.32',
 '151',
 '56.4 %',
 '154,188,546',
 '3.51 %',
 '7,794,798,739',
 '4']

In [29]:
df_indo = pd.DataFrame(indo_population, columns=header)
df_indo.head()

Unnamed: 0,Year,Population,Yearly % Change,Yearly Change,Migrants (net),Median Age,Fertility Rate,Density (P/Km²),Urban Pop %,Urban Population,Country's Share of World Pop,World Population,IndonesiaGlobal Rank
0,2020,273523615,1.07 %,2898047,-98955,29.7,2.32,151,56.4 %,154188546,3.51 %,7794798739,4
1,2019,270625568,1.10 %,2955025,-98955,28.8,2.42,149,55.8 %,150900390,3.51 %,7713468100,4
2,2018,267670543,1.14 %,3019580,-98955,28.8,2.42,148,55.1 %,147603006,3.51 %,7631091040,4
3,2017,264650963,1.18 %,3094582,-98955,28.8,2.42,146,54.5 %,144294861,3.51 %,7547858925,4
4,2016,261556381,1.23 %,3173125,-98955,28.8,2.42,144,53.9 %,140972925,3.50 %,7464022049,4


In [30]:
df_indo = df_indo.sort_values(by='Year')
df_indo = df_indo.loc[:,['Year','Population','Median Age','Fertility Rate','Urban Pop %']]
# df_indo = df_indo.drop_duplicates(subset='Year') # ada 2020 double, hapus duplikatnya

df_indo

Unnamed: 0,Year,Population,Median Age,Fertility Rate,Urban Pop %
17,1955,77273425,20.4,5.49,13.5 %
16,1960,87751068,20.2,5.67,14.6 %
15,1965,100267062,19.4,5.62,15.8 %
14,1970,114793178,18.6,5.57,17.1 %
13,1975,130680727,18.5,5.3,19.3 %
12,1980,147447836,19.1,4.73,22.1 %
11,1985,164982451,19.9,4.11,26.1 %
10,1990,181413402,21.3,3.4,30.6 %
9,1995,196934260,22.8,2.9,36.1 %
8,2000,211513823,24.4,2.55,42.0 %


In [31]:
#  menghilangkan unsur '%' di kolom Urban Pop %

kosong = []
for i in range(len(df_indo['Urban Pop %'])):
    j = df_indo['Urban Pop %'][i].split() # split jadi (0.39, %)
    kosong.append(float(j[0])) # indexing angkanya

df_indo['Urban Pop %'] = kosong


In [32]:
df_indo = df_indo.drop_duplicates(subset='Year') # ada 2020 double, hapus duplikatnya
df_indo

Unnamed: 0,Year,Population,Median Age,Fertility Rate,Urban Pop %
17,1955,77273425,20.4,5.49,56.4
16,1960,87751068,20.2,5.67,55.8
15,1965,100267062,19.4,5.62,55.1
14,1970,114793178,18.6,5.57,54.5
13,1975,130680727,18.5,5.3,53.9
12,1980,147447836,19.1,4.73,53.3
11,1985,164982451,19.9,4.11,50.1
10,1990,181413402,21.3,3.4,46.0
9,1995,196934260,22.8,2.9,42.0
8,2000,211513823,24.4,2.55,36.1


In [33]:
# indexing per 10 tahun

decade = []

for i in df_indo['Year']:
    if int(i)%10 == 0:
        decade.append(i)

decade

['1960',
 '1970',
 '1980',
 '1990',
 '2000',
 '2010',
 '2020',
 '2030',
 '2040',
 '2050']

In [34]:
df_indo

Unnamed: 0,Year,Population,Median Age,Fertility Rate,Urban Pop %
17,1955,77273425,20.4,5.49,56.4
16,1960,87751068,20.2,5.67,55.8
15,1965,100267062,19.4,5.62,55.1
14,1970,114793178,18.6,5.57,54.5
13,1975,130680727,18.5,5.3,53.9
12,1980,147447836,19.1,4.73,53.3
11,1985,164982451,19.9,4.11,50.1
10,1990,181413402,21.3,3.4,46.0
9,1995,196934260,22.8,2.9,42.0
8,2000,211513823,24.4,2.55,36.1


In [35]:
df_indo[df_indo['Year'].isin(decade)].reset_index(drop=True)

Unnamed: 0,Year,Population,Median Age,Fertility Rate,Urban Pop %
0,1960,87751068,20.2,5.67,55.8
1,1970,114793178,18.6,5.57,54.5
2,1980,147447836,19.1,4.73,53.3
3,1990,181413402,21.3,3.4,46.0
4,2000,211513823,24.4,2.55,36.1
5,2010,241834215,27.2,2.5,26.1
6,2020,273523615,29.7,2.32,13.5
7,2030,299198430,32.4,2.32,62.1
8,2040,318637858,35.1,2.32,66.8
9,2050,330904664,37.4,2.32,70.7
