In [87]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime as dt
import json
import sqlite3
from enum import Enum

In [88]:
class Mode(Enum):
    """Stages of the ETL pipeline.

    Each member's value is the label that ``writeLog`` prints between brackets.
    """

    # Downloading and saving the raw table
    EXTRACT = 'EXTRACT'
    # Parsing and reshaping the data
    TRANSFORM = 'TRANSFORM'
    # Writing the result to the database
    LOAD = 'LOAD'

def writeLog(state: "Mode", isStart: bool, logPath: str = './data/etl_project_log.txt'):
    """Append a timestamped start/end marker for an ETL stage to the log file.

    Each call adds one line of the form
    ``<Year-Mon-Day-Hour-Minute-Second>, [<STAGE>] Started`` or ``... Ended``.

    Args:
        state: Stage being logged; its ``value`` is written between brackets.
        isStart: True to record "Started", False to record "Ended".
        logPath: File to append to. Defaults to the project log file, so
            existing two-argument calls behave exactly as before.
    """
    timestamp = dt.datetime.now().strftime('%Y-%b-%d-%H-%M-%S')
    event = 'Started' if isStart else 'Ended'
    # Explicit UTF-8 keeps the log encoding independent of the platform default.
    with open(logPath, 'a', encoding='utf-8') as log:
        log.write(f'{timestamp}, [{state.value}] {event}\n')

def getSoup(url, timeout=10):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up (default 10).

    Returns:
        BeautifulSoup: The response body parsed with ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with the real HTTP error instead of returning None, which
    # would only surface later as an AttributeError on the missing soup.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
    
state = Mode.EXTRACT
# <<<
# EXTRACT
# >>>
writeLog(state, True)
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29"
soup = getSoup(url)
if soup is None:
    raise RuntimeError(f'Could not download {url}')

# First table on the page carrying both the "wikitable" and "sortable" classes
table = soup.select_one('table.wikitable.sortable')
if table is None:
    raise ValueError(f'No "table.wikitable.sortable" element found at {url}')
# Header rows and all rows of that table; both are consumed by the transform cells
head = table.select('tr.static-row-header')
body = table.find_all('tr')

# Save the raw data as a JSON file (the table's text is stored as a single JSON string)
with open('./data/Countries_by_GDP.json', 'w', encoding='utf-8') as jsonFile:
    json.dump(table.text, jsonFile, ensure_ascii=False, indent=4)
# No explicit close() needed: the with-statement has already closed the file.
writeLog(state, False)
state = Mode.TRANSFORM

In [89]:
# <<<
# TRANSFORM
# >>>

# Extract the table's column headers
writeLog(state, True)
headerCells = head[0].find_all('th')
# The first header cell is the category label
category = headerCells[0].text.strip() if headerCells else ''
# Every remaining header cell names an organization through its link text
organization = [cell.find('a').text for cell in headerCells[1:]]

In [90]:
# Country lookup table; its 'name' and 'region' columns are used to tag each parsed row
regionDf = pd.read_csv(filepath_or_buffer='./data/region.csv')

In [91]:
# Table Parsing
# Build the country -> region lookup once instead of filtering regionDf for every row.
# drop_duplicates keeps the first occurrence of a name, like the previous `.values[0]`.
regionByCountry = regionDf.drop_duplicates('name').set_index('name')['region'].to_dict()

infoAll = []
for rank, row in enumerate(body):
    # Skip the first three <tr> rows
    # (presumably the two header rows and the world-total row — confirm against the page).
    if rank < 3:
        continue
    # Remove <sup> elements (footnote markers) so they do not end up in the cell text
    while row.sup is not None:
        row.sup.decompose()
    # Collect the non-empty cell texts of this row
    info = []
    for item in row:
        value = item.text.strip()
        # Skip children with no text
        if value == '':
            continue
        # A dash means the organization has no figure: mark both value and year as missing
        if value == '—':
            info.extend(['NaN', 'NaN'])
        else:
            info.append(value)
    # A row without any text has no country name to work with
    if not info:
        continue
    # Convert everything after the country name to float.
    # Years are floats too because a missing year is NaN, which int cannot represent.
    for i in range(1, len(info)):
        info[i] = float(info[i].replace(',', ''))
    # Attach the region; a country absent from region.csv gets None instead of
    # aborting the whole parse with an IndexError.
    info.insert(1, regionByCountry.get(info[0]))
    infoAll.append(info)

In [92]:
# Build the list of DataFrame column names
# The second static header row supplies the remaining labels, one per line of its text
tempColumn = head[1].get_text().strip('\n').split('\n')
columnList = [category, 'region', *tempColumn]

In [93]:
# DataFrame holding the figures from every organization
gdpDf = pd.DataFrame(infoAll, columns=columnList)
# Cast the year values to pandas' nullable Int64 dtype.
# NOTE(review): columnList appears to contain more than one 'Year' label (see the
# rendered output), in which case this selects and converts all of them at once — confirm.
gdpDf['Year'] = gdpDf['Year'].astype('Int64')
# Show the frame as the cell output
gdpDf

Unnamed: 0,Country/Territory,region,Forecast,Year,Estimate,Year.1,Estimate.1,Year.2
0,United States,Americas,30337162.0,2025,27360935.0,2023,25744100.0,2022
1,China,Asia,19534894.0,2025,17794782.0,2023,17963170.0,2022
2,Germany,Europe,4921563.0,2025,4456081.0,2023,4076923.0,2022
3,Japan,Asia,4389326.0,2025,4212945.0,2023,4232173.0,2022
4,India,Asia,4271922.0,2025,3549919.0,2023,3465541.0,2022
...,...,...,...,...,...,...,...,...
204,Kiribati,Oceania,311.0,2024,279.0,2023,223.0,2022
205,Palau,Oceania,308.0,2024,263.0,2023,225.0,2022
206,Marshall Islands,Oceania,305.0,2024,284.0,2023,279.0,2022
207,Nauru,Oceania,161.0,2024,154.0,2023,147.0,2022


In [94]:
# Keep only the IMF figures: the first four columns of gdpDf.
# Work on an explicit copy so nothing below writes into a slice of gdpDf
# (the in-place edits on the slice are what trigger SettingWithCopyWarning).
gdpImf = (
    gdpDf.iloc[:, :4]
    .copy()
    .rename(columns={'Forecast': 'GDP', 'Country/Territory': 'Country'})
)
# GDP divided by 1000 and rounded to two decimals
# (presumably the source figures are in millions of USD — confirm).
gdpImf['GDP_USD_billion'] = (gdpImf['GDP'] / 1000).round(2)
# Largest economies first, with a fresh 0..n-1 index
gdpImf = (
    gdpImf
    .sort_values('GDP_USD_billion', ascending=False)
    .reset_index(drop=True)
)
gdpImf

Unnamed: 0,Country,region,GDP,Year,GDP_USD_billion
0,United States,Americas,30337162.0,2025,30337.16
1,China,Asia,19534894.0,2025,19534.89
2,Germany,Europe,4921563.0,2025,4921.56
3,Japan,Asia,4389326.0,2025,4389.33
4,India,Asia,4271922.0,2025,4271.92
...,...,...,...,...,...
204,Greenland,Americas,,,
205,Eritrea,Africa,,,
206,Zanzibar,Africa,,,
207,Sint Maarten,Americas,,,


In [95]:
# Countries whose GDP is 100B USD or more.
# The bound is inclusive (>=): the stated criterion is "100B or more", which a
# strict > would miss for a value of exactly 100.
gdpImf[gdpImf['GDP_USD_billion'] >= 100]

Unnamed: 0,Country,region,GDP,Year,GDP_USD_billion
0,United States,Americas,30337162.0,2025,30337.16
1,China,Asia,19534894.0,2025,19534.89
2,Germany,Europe,4921563.0,2025,4921.56
3,Japan,Asia,4389326.0,2025,4389.33
4,India,Asia,4271922.0,2025,4271.92
...,...,...,...,...,...
67,Uzbekistan,Asia,112653.0,2024,112.65
68,Guatemala,Americas,112369.0,2024,112.37
69,Oman,Asia,109993.0,2024,109.99
70,Bulgaria,Europe,108425.0,2024,108.42


In [96]:
# Average GDP of the five largest economies in each region
topFiveByRegion = (
    gdpImf
    .sort_values(['region', 'GDP_USD_billion'], ascending=[True, False])
    .groupby('region')
    .head(5)
)
gdpRegion = topFiveByRegion.groupby('region')['GDP_USD_billion'].mean()
gdpRegion

region
Africa       285.184
Americas    7473.330
Asia        6327.178
Europe      3318.112
Oceania      436.658
Name: GDP_USD_billion, dtype: float64

In [97]:
# Record the end of the TRANSFORM stage, then switch to the LOAD stage
writeLog(state, isStart=False)
state = Mode.LOAD

In [98]:
# <<<
# LOAD
# >>>
writeLog(state, isStart=True)
# The connection stays open because the query cells below reuse it
con = sqlite3.connect('./data/World_Economies.db')
# Replace any existing 'gdp' table with the current IMF frame
gdpImf.to_sql(name='gdp', con=con, if_exists='replace')
writeLog(state, isStart=False)

### Query를 사용하여 출력하기

In [99]:
# Countries whose GDP is 100B USD or more, read back from the loaded table.
# The bound is inclusive (>=) to honour the "100B or more" criterion of the analysis.
con.cursor().execute(
    '''
    SELECT *
    FROM gdp
    WHERE GDP_USD_billion >= 100;
''').fetchall()

[(0, 'United States', 'Americas', 30337162.0, 2025, 30337.16),
 (1, 'China', 'Asia', 19534894.0, 2025, 19534.89),
 (2, 'Germany', 'Europe', 4921563.0, 2025, 4921.56),
 (3, 'Japan', 'Asia', 4389326.0, 2025, 4389.33),
 (4, 'India', 'Asia', 4271922.0, 2025, 4271.92),
 (5, 'United Kingdom', 'Europe', 3730261.0, 2025, 3730.26),
 (6, 'France', 'Europe', 3283429.0, 2025, 3283.43),
 (7, 'Italy', 'Europe', 2459597.0, 2025, 2459.6),
 (8, 'Canada', 'Americas', 2330308.0, 2025, 2330.31),
 (9, 'Brazil', 'Americas', 2307162.0, 2025, 2307.16),
 (10, 'Russia', 'Europe', 2195708.0, 2025, 2195.71),
 (11, 'South Korea', 'Asia', 1947133.0, 2025, 1947.13),
 (12, 'Australia', 'Oceania', 1881140.0, 2025, 1881.14),
 (13, 'Spain', 'Europe', 1827576.0, 2025, 1827.58),
 (14, 'Mexico', 'Americas', 1817818.0, 2025, 1817.82),
 (15, 'Indonesia', 'Asia', 1492618.0, 2025, 1492.62),
 (16, 'Turkey', 'Asia', 1455413.0, 2025, 1455.41),
 (17, 'Netherlands', 'Europe', 1272960.0, 2025, 1272.96),
 (18, 'Saudi Arabia', 'Asia',

In [100]:
# Average GDP of the five largest economies per region, computed in SQL:
# rows are numbered within each region by descending GDP and the first five are averaged.
regionTopFiveQuery = '''
    WITH rankedByRegionGdp AS (
        SELECT
            Country,
            region,
            GDP_USD_billion,
            ROW_NUMBER() OVER (PARTITION BY region ORDER BY GDP DESC) AS rank
        FROM gdp
    )
    SELECT region, AVG(GDP_USD_billion)
    FROM rankedByRegionGdp
    WHERE rank <= 5
    GROUP BY region;
    '''
regionCursor = con.cursor()
regionCursor.execute(regionTopFiveQuery)
regionCursor.fetchall()

[('Africa', 285.184),
 ('Americas', 7473.33),
 ('Asia', 6327.178),
 ('Europe', 3318.112),
 ('Oceania', 436.658)]