In [9]:
!pip install BeautifulSoup4
!pip install pycountry-convert

Collecting pycountry-convert
  Downloading pycountry_convert-0.7.2-py3-none-any.whl.metadata (7.2 kB)
Collecting pprintpp>=0.3.0 (from pycountry-convert)
  Downloading pprintpp-0.4.0-py2.py3-none-any.whl.metadata (7.9 kB)
Collecting pycountry>=16.11.27.1 (from pycountry-convert)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting pytest>=3.4.0 (from pycountry-convert)
  Downloading pytest-8.3.4-py3-none-any.whl.metadata (7.5 kB)
Collecting pytest-mock>=1.6.3 (from pycountry-convert)
  Downloading pytest_mock-3.14.0-py3-none-any.whl.metadata (3.8 kB)
Collecting pytest-cov>=2.5.1 (from pycountry-convert)
  Downloading pytest_cov-6.0.0-py3-none-any.whl.metadata (27 kB)
Collecting repoze.lru>=0.7 (from pycountry-convert)
  Downloading repoze.lru-0.7-py3-none-any.whl.metadata (1.1 kB)
Collecting iniconfig (from pytest>=3.4.0->pycountry-convert)
  Downloading iniconfig-2.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting pluggy<2,>=1.5 (from pytest>=3.4.0->pycountry-co

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import pycountry_convert
import pycountry
import sqlite3

In [2]:
class Logger:
    """Append-only, timestamped text logger used by the ETL run.

    Each message is written to ``logfile_name`` as ``"<timestamp>, <message>"``.
    The logger must be started (``start()`` or a ``with`` block) before
    ``info()`` is called, and ended (``end()``) to write the run separator
    and release the file handle.

    Usage::

        with Logger() as logger:
            logger.info("step started")
    """

    def __init__(self):
        self.logfile_name = 'etl_project_log.txt'
        self.log_format = '{timestamp}, {message}'
        self.timestamp_format = '%Y-%b-%d-%H-%M-%S'
        self.file = None

    def __enter__(self):
        """Open the log file and return the logger itself."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the log file; exceptions from the ``with`` body propagate."""
        self.end()
        return False

    def start(self):
        """Open the log file in append mode; a no-op if it is already open."""
        if self.file is None :
            # Explicit encoding so the log does not depend on the platform locale.
            self.file = open(self.logfile_name, mode='a', encoding='utf-8')

    def end(self):
        """Write the run separator and close the log file; a no-op if not open."""
        if self.file :
            try:
                self.file.write("===============================================" + "\n\n")
            finally:
                # Release the handle even if the separator could not be written.
                self.file.close()
                self.file = None

    def get_timestamp(self):
        """Return the current local time formatted with ``timestamp_format``."""
        return datetime.now().strftime(self.timestamp_format)

    def info(self,message):
        """Append one timestamped line to the log.

        Raises:
            RuntimeError: if the logger has not been started.
        """
        if self.file is None :
            raise RuntimeError("Logger is not started; call start() before info()")

        timestamp = self.get_timestamp()
        formatted_message = self.log_format.format(timestamp=timestamp, message=message)
        self.file.write(formatted_message + "\n")
        # Flush so already-logged lines survive a crash later in the run.
        self.file.flush()
        

In [3]:
# Extract the GDP table from the Wikipedia page by web scraping
# and save the raw rows as JSON.
def extract_data():
    """Scrape country / GDP / year rows from Wikipedia and save them as JSON.

    Fetches the "List of countries by GDP (nominal)" page, reads the first
    three cells of every data row in each ``table.wikitable``, and writes the
    rows to ``Countries_by_GDP.json`` (``orient='index'``).

    Returns:
        list: the extracted ``[country, gdp, year]`` rows on success, or an
        empty list when the server does not answer with HTTP 200 (the status
        code is printed and no file is written in that case).
    """
    url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29"
    # Without a timeout a stalled connection would block the pipeline forever.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print(response.status_code)
        return []

    # Parse the HTML text into a navigable Python object.
    soup = BeautifulSoup(response.text, 'html.parser')
    raw_data = []

    for table in soup.select('table.wikitable > tbody'):
        for row in table.find_all('tr'):
            columns = row.find_all('td')
            # Three cells (country, GDP, year) are read below, so skip
            # header rows and any row that is too short to index safely.
            if len(columns) < 3:
                continue

            # Drop <sup> tags so they do not pollute the cell text.
            for column in columns:
                for tag in column.find_all('sup'):
                    tag.decompose()

            country = columns[0].get_text(strip = True)
            gdp = columns[1].get_text(strip = True)
            year = columns[2].get_text(strip = True)

            # An em dash marks a missing value on the page.
            if gdp == '—' :
                gdp = None
                year = None

            raw_data.append([country, gdp, year])

    df = pd.DataFrame(raw_data, columns = ["Country", "GDP_USD_million", "Year"])
    df.to_json('Countries_by_GDP.json', orient = 'index', indent = 4)
    # Return a list on both paths so callers can test the result uniformly.
    return raw_data

In [4]:
# Convert each country's GDP from million USD to billion USD
# and fill in the Region (continent) for each country.
def transform_data():
    """Load the extracted JSON and derive the billion-USD and Region columns.

    Reads ``Countries_by_GDP.json``, strips thousands separators from
    ``GDP_USD_million``, converts it to billions rounded to two decimals
    (``GDP_USD_billion``), maps ``Country`` to a continent name (``Region``),
    and drops the original ``GDP_USD_million`` column.

    Returns:
        pandas.DataFrame: the transformed frame.
    """
    df = pd.read_json('Countries_by_GDP.json', orient = 'index')

    gdp_without_commas = df["GDP_USD_million"].replace(",", "", regex = True)
    # Coerce unparseable values to NaN; leaving them as strings would make
    # the division below fail with an opaque TypeError.
    gdp_million = pd.to_numeric(gdp_without_commas, errors = "coerce")

    df["GDP_USD_billion"] = gdp_million.div(1000).round(2)
    df["Region"] = df["Country"].apply(convert_name_to_continent)

    df = df.drop("GDP_USD_million", axis = 1)
    return df

def convert_name_to_continent(country_name):
    """Return the continent name for ``country_name``, or ``None`` if unknown.

    The name is resolved through ``pycountry_convert`` first. If that lookup
    raises for any reason, a small hand-maintained table is consulted instead.
    """
    # Manual fallbacks for names the pycountry_convert lookup fails on.
    fallback_continents = {
        "Kosovo" : "Europe",
        "DR Congo" : "Africa",
        "Zanzibar" : "Africa",
        "East Timor" : "Asia",
        "Sint Maarten" : "North America"
    }

    try:
        iso_alpha2 = pycountry_convert.country_name_to_country_alpha2(country_name)
        if not iso_alpha2:
            return None

        code = pycountry_convert.country_alpha2_to_continent_code(iso_alpha2)
        return pycountry_convert.convert_continent_code_to_continent_name(code)
    except Exception:
        # Lookup failed: fall back to the manual table (None when absent).
        return fallback_continents.get(country_name)

In [9]:
def load_data(df):
    """Write ``df`` to the ``Countries_by_GDP`` table of ``World_Economies.db``.

    An existing table of the same name is replaced. The connection is closed
    even when the write fails, and the original exception propagates.

    Args:
        df: object exposing a pandas-style ``to_sql`` method (a DataFrame).
    """
    conn = sqlite3.connect('World_Economies.db')
    try:
        df.to_sql('Countries_by_GDP', conn, if_exists='replace')
    finally:
        # Always release the database handle, including on a failed write.
        conn.close()

In [24]:
def print_screen():
    """Print two reports from the ``Countries_by_GDP`` table.

    1. Every row whose ``GDP_USD_billion`` is at least 100.
    2. For each non-null ``Region``, the average ``GDP_USD_billion`` of its
       five highest-GDP rows.

    The database connection is closed even if a query fails.
    """
    conn = sqlite3.connect('World_Economies.db')
    try:
        cur = conn.cursor()
        try:
            print("<< GDP가 100B USD이상이 되는 국가만 출력 >>")
            print()
            # The cursor is iterable, so rows are streamed without fetchall().
            for row in cur.execute("""
    SELECT * FROM Countries_by_GDP WHERE GDP_USD_billion >= 100;
    """):
                print(row)

            print()
            print("<< 각 Region별로 top5 국가의 GDP 평균>>")
            # ROW_NUMBER ranks rows inside each Region by GDP (descending);
            # the outer query averages only ranks 1..5.
            for row in cur.execute("""
    SELECT Region, AVG(GDP_USD_billion)
    FROM (
        SELECT Country, Region, GDP_USD_billion, ROW_NUMBER() OVER
        (PARTITION BY Region ORDER BY GDP_USD_billion DESC) AS rn
        FROM Countries_by_GDP
        WHERE Region IS NOT NULL
    ) 
    WHERE rn <= 5
    GROUP BY(Region)
    ;
    """):
                print(row)
            print()
        finally:
            cur.close()
    finally:
        conn.close()


In [25]:
# Entry point that runs the whole pipeline.
def ETL():
    """Run extract -> transform -> load with logging, then print the reports.

    Every stage is bracketed by "started"/"completed" log lines. If a stage
    raises, the failure is logged, the log file is still closed, and the
    exception is re-raised. The reports are printed only after a successful
    load.
    """
    logger = Logger()
    logger.start()

    try:
        logger.info("Extract started")
        extracted = extract_data()
        # extract_data hands back an empty list when nothing was retrieved;
        # do not claim completion in the log in that case.
        if isinstance(extracted, list) and not extracted:
            logger.info("Extract returned no data")
        else:
            logger.info("Extract completed")

        logger.info("Transform started")
        transformed_data = transform_data()
        logger.info("Transform completed")

        logger.info("Load started")
        load_data(transformed_data)
        logger.info("Load completed")
    except Exception as exc:
        logger.info(f"ETL failed: {exc!r}")
        raise
    finally:
        # Always write the separator and release the log file handle.
        logger.end()

    print_screen()
# Run the full pipeline when this cell executes.
ETL()

<< GDP가 100B USD이상이 되는 국가만 출력 >>

(0, 'World', 2025.0, 115494.31, None)
(1, 'United States', 2025.0, 30337.16, 'North America')
(2, 'China', 2025.0, 19534.89, 'Asia')
(3, 'Germany', 2025.0, 4921.56, 'Europe')
(4, 'Japan', 2025.0, 4389.33, 'Asia')
(5, 'India', 2025.0, 4271.92, 'Asia')
(6, 'United Kingdom', 2025.0, 3730.26, 'Europe')
(7, 'France', 2025.0, 3283.43, 'Europe')
(8, 'Italy', 2025.0, 2459.6, 'Europe')
(9, 'Canada', 2025.0, 2330.31, 'North America')
(10, 'Brazil', 2025.0, 2307.16, 'South America')
(11, 'Russia', 2025.0, 2195.71, 'Europe')
(12, 'South Korea', 2025.0, 1947.13, 'Asia')
(13, 'Australia', 2025.0, 1881.14, 'Oceania')
(14, 'Spain', 2025.0, 1827.58, 'Europe')
(15, 'Mexico', 2025.0, 1817.82, 'North America')
(16, 'Indonesia', 2025.0, 1492.62, 'Asia')
(17, 'Turkey', 2025.0, 1455.41, 'Asia')
(18, 'Netherlands', 2025.0, 1272.96, 'Europe')
(19, 'Saudi Arabia', 2025.0, 1136.58, 'Asia')
(20, 'Switzerland', 2025.0, 999.6, 'Europe')
(21, 'Poland', 2025.0, 915.45, 'Europe')
(22,