In [9]:
!pip install BeautifulSoup4



In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import os
import logging

In [2]:
class Logger:
    """File-backed logger used by the ETL steps.

    All instances share one named ``logging`` logger, so constructing
    ``Logger`` repeatedly does not attach duplicate file handlers.

    Attributes:
        logger: The underlying ``logging.Logger`` that writes to
            ``etl_project_log.txt`` in append mode.
    """

    def __init__(self):
        logfileName = 'etl_project_log.txt'
        log_format = '%(asctime)s, %(message)s'
        date_format = "%Y-%b-%d-%H-%M-%S"

        # Use a dedicated named logger instead of the root logger. The root
        # logger may already carry handlers installed by other code, in which
        # case an "is it empty?" check on it would skip our file handler and
        # the ETL messages would never reach the log file.
        self.logger = logging.getLogger('etl_project')
        self.logger.setLevel(logging.INFO)
        # Keep ETL messages in the log file only; do not forward them to
        # whatever handlers the root logger has.
        self.logger.propagate = False

        # The named logger is shared across instances, so attach the file
        # handler only the first time.
        if not self.logger.handlers:
            file_handler = logging.FileHandler(logfileName, mode='a')
            file_handler.setFormatter(logging.Formatter(log_format, datefmt=date_format))
            self.logger.addHandler(file_handler)

    def info(self, value):
        """Write ``value`` to the log file at INFO level."""
        self.logger.info(value)
            
        

In [3]:
def extract_data():
    """Scrape GDP rows from the Wikipedia "List of countries by GDP (nominal)" page.

    Every ``table.wikitable`` on the page is scanned. For each table row with
    at least three ``<td>`` cells, the first three cells are read as country,
    GDP and year after ``<sup>`` tags are stripped from them.

    Returns:
        list: One ``[rank, country, gdp, year]`` list per row. ``gdp`` and
        ``year`` are the cell text, or ``None`` when the GDP cell is '—'.
        An empty list is returned when the HTTP status is not 200 (the status
        code is printed in that case).

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    log("Extract : start")
    url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29"
    # Without a timeout a stalled connection would block the run indefinitely.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        log("Extract : end")
        print(response.status_code)
        return []

    # Parse the HTML text into a navigable Python object.
    soup = BeautifulSoup(response.text, 'html.parser')

    raw_data = []
    rank = 0
    for table in soup.select('table.wikitable > tbody'):
        for row in table.find_all('tr'):
            columns = row.find_all('td')
            # Three cells are indexed below, so rows with fewer (header rows,
            # short rows) are skipped instead of raising IndexError.
            if len(columns) < 3:
                continue

            # Drop unneeded markup such as <sup> footnote markers.
            for column in columns:
                for tag in column.find_all('sup'):
                    tag.decompose()

            country = columns[0].get_text(strip=True)
            gdp = columns[1].get_text(strip=True)
            year = columns[2].get_text(strip=True)

            # A dash in the GDP cell means the value is missing.
            if gdp == '—':
                gdp = None
                year = None

            raw_data.append([rank, country, gdp, year])

            # Per the original author's note, the rank shown on the page
            # cannot be read through BeautifulSoup, so it is counted here.
            # Rows without GDP data do not advance the rank.
            if gdp is not None:
                rank += 1

    log("Extract : end")
    return raw_data

In [4]:
# Convert each country's GDP from million USD to billion USD.
# TODO: the Region lookup mentioned in the original note is not implemented here.
def transform_data(raw_data):
    """Build a DataFrame from extracted rows and rescale GDP to billions.

    Args:
        raw_data: Iterable of ``[rank, country, gdp, year]`` rows, where
            ``gdp`` is a string that may contain thousands separators
            (e.g. ``"1,234,567"``) or ``None``.

    Returns:
        pandas.DataFrame: Columns ``Rank``, ``Country``, ``GDP_USD_billion``
        and ``Year``. ``GDP_USD_billion`` is the parsed GDP divided by 1000
        and rounded to two decimals; missing or unparsable values are NaN.
    """
    log("Transform : start")
    df = pd.DataFrame(raw_data, columns=["Rank", "Country", "GDP_USD_billion", "Year"])

    gdp_text = df["GDP_USD_billion"].replace(",", "", regex=True)
    # `astype(float, errors="ignore")` would hand back the raw strings when a
    # single value fails to parse, making the division below raise. Coercing
    # turns only the bad values into NaN and always yields a float column.
    gdp_million = pd.to_numeric(gdp_text, errors="coerce").astype(float)
    df["GDP_USD_billion"] = gdp_million.div(1000).round(2)

    log("Transform : end")
    return df

In [5]:
def load_data(dataframe):
    """Write ``dataframe`` to ``db.json``, keyed by row index, indented by 4.

    Args:
        dataframe: The pandas DataFrame to persist.
    """
    log("Load : start")
    dataframe.to_json(path_or_buf='db.json', orient='index', indent=4)
    log("Load : end")

In [6]:
def log(message):
    """Record ``message`` at INFO level through a ``Logger`` instance."""
    Logger().info(message)
    

In [7]:
def print_screen():
    """Read ``db.json`` and print the rows whose GDP is at least 100 billion USD.

    The file is read with ``orient='index'``, the layout ``load_data`` writes.
    """
    gdp_table = pd.read_json('db.json', orient='index')
    at_least_100b = gdp_table['GDP_USD_billion'].ge(100)

    print("<< GDP가 100B USD이상이 되는 국가만 출력 >>", end="\n\n")
    print(gdp_table.loc[at_least_100b], end="\n\n")

    # TODO: only the header is printed; no per-region statistics follow it.
    print("<< 각 Region별로 top5 국가의 GDP 평균>>")
    

In [9]:
# Entry point for the whole pipeline.
def ETL():
    """Run extract, transform and load in order, then print the report."""
    load_data(transform_data(extract_data()))
    print_screen()
    
# Run the full pipeline when this cell executes.
ETL()

<< GDP가 100B USD이상이 되는 국가만 출력 >>

    Rank        Country  GDP_USD_billion    Year
0      0          World        115494.31  2025.0
1      1  United States         30337.16  2025.0
2      2          China         19534.89  2025.0
3      3        Germany          4921.56  2025.0
4      4          Japan          4389.33  2025.0
..   ...            ...              ...     ...
69    68     Uzbekistan           112.65  2024.0
70    69      Guatemala           112.37  2024.0
71    70           Oman           109.99  2024.0
72    71       Bulgaria           108.42  2024.0
73    72      Venezuela           106.33  2024.0

[73 rows x 4 columns]

<< 각 Region별로 top5 국가의 GDP 평균>>
