In [9]:
!pip install BeautifulSoup4
!pip install pycountry-convert

Collecting pycountry-convert
  Downloading pycountry_convert-0.7.2-py3-none-any.whl.metadata (7.2 kB)
Collecting pprintpp>=0.3.0 (from pycountry-convert)
  Downloading pprintpp-0.4.0-py2.py3-none-any.whl.metadata (7.9 kB)
Collecting pycountry>=16.11.27.1 (from pycountry-convert)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting pytest>=3.4.0 (from pycountry-convert)
  Downloading pytest-8.3.4-py3-none-any.whl.metadata (7.5 kB)
Collecting pytest-mock>=1.6.3 (from pycountry-convert)
  Downloading pytest_mock-3.14.0-py3-none-any.whl.metadata (3.8 kB)
Collecting pytest-cov>=2.5.1 (from pycountry-convert)
  Downloading pytest_cov-6.0.0-py3-none-any.whl.metadata (27 kB)
Collecting repoze.lru>=0.7 (from pycountry-convert)
  Downloading repoze.lru-0.7-py3-none-any.whl.metadata (1.1 kB)
Collecting iniconfig (from pytest>=3.4.0->pycountry-convert)
  Downloading iniconfig-2.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting pluggy<2,>=1.5 (from pytest>=3.4.0->pycountry-co

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import pycountry_convert
import pycountry
from multiprocessing import Pool
from multiprocessing import set_start_method
import multiprocessing as mp
import numpy as np
import os

In [49]:
# Minimal file logger built on datetime.
# start() opens the log file for appending; end() writes a separator and closes it.
class Logger:
    """Append timestamped messages to a plain-text log file.

    Typical use is ``start()`` -> ``info(...)`` -> ``end()``. The class also
    works as a context manager (``with Logger() as logger: ...``), which
    guarantees the file is closed even when the body raises.
    """

    def __init__(self):
        self.logfile_name = 'etl_project_log.txt'
        self.log_format = '{timestamp}, {message}'
        self.timestamp_format = '%Y-%b-%d-%H-%M-%S'
        # Open file handle while logging is active, otherwise None.
        self.file = None

    def __enter__(self):
        """Open the log file and return the logger itself."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the log file; never suppresses an exception from the body."""
        self.end()
        return False

    def start(self):
        """Open the log file in append mode. Calling it again while open is a no-op."""
        if self.file is None:
            # Explicit encoding so the log does not depend on the platform default.
            self.file = open(self.logfile_name, mode='a', encoding='utf-8')

    def end(self):
        """Write a run separator and close the log file. No-op when not started."""
        if self.file:
            try:
                self.file.write("===============================================" + "\n\n")
            finally:
                # Always release the handle, even if the separator write fails.
                try:
                    self.file.close()
                finally:
                    self.file = None

    def get_timestamp(self):
        """Return the current local time formatted with ``timestamp_format``."""
        return datetime.now().strftime(self.timestamp_format)

    def info(self, message):
        """Write one ``'<timestamp>, <message>'`` line to the log.

        Raises:
            RuntimeError: if the logger has not been started.
        """
        if self.file is None:
            raise RuntimeError("Logger is not started; call start() before info().")

        timestamp = self.get_timestamp()
        formatted_message = self.log_format.format(timestamp=timestamp, message=message)
        self.file.write(formatted_message + "\n")
        # Flush right away so entries survive a crash or kernel restart.
        self.file.flush()
        

In [11]:
# Extract step: scrape the Wikipedia "List of countries by GDP (nominal)" page
# and store the raw rows as JSON.
def _parse_gdp_rows(html):
    """Return ``[country, gdp, year]`` lists parsed from every wikitable in ``html``."""
    soup = BeautifulSoup(html, 'html.parser')
    parsed_rows = []

    for table in soup.select('table.wikitable > tbody'):
        for row in table.find_all('tr'):
            columns = row.find_all('td')
            # Three cells are read below, so skip header rows and short rows
            # instead of failing with IndexError on columns[2].
            if len(columns) < 3:
                continue

            # Drop <sup> tags (footnote markers and similar) before reading text.
            for column in columns:
                for tag in column.find_all('sup'):
                    tag.decompose()

            country = columns[0].get_text(strip = True)
            gdp = columns[1].get_text(strip = True)
            year = columns[2].get_text(strip = True)

            # An em dash marks a missing value: blank out both GDP and year.
            if gdp == '—':
                gdp = None
                year = None

            parsed_rows.append([country, gdp, year])

    return parsed_rows


def extract_data():
    """Download the GDP page, parse its tables and write ``Countries_by_GDP.json``.

    Side effects:
        * writes the downloaded page to ``file.html``;
        * writes the parsed rows (columns ``Country``, ``GDP_USD_million``,
          ``Year``) to ``Countries_by_GDP.json`` with ``orient='index'``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29"

    # Stream the response to disk chunk by chunk; the timeout keeps the
    # pipeline from hanging forever on a stalled connection.
    with requests.get(url, stream = True, timeout = 30) as response:
        response.raise_for_status()
        with open('file.html', 'wb') as file:
            for chunk in response.iter_content(chunk_size = 8192):
                if chunk:
                    file.write(chunk)

    with open('file.html', 'rb') as file:
        html = file.read()

    raw_data = _parse_gdp_rows(html)

    df = pd.DataFrame(raw_data, columns = ["Country", "GDP_USD_million", "Year"])
    df.to_json('Countries_by_GDP.json', orient = 'index', indent = 4)

In [9]:
# Transform step: convert GDP from million USD to billion USD
# and attach the Region (continent) of each country.
def transform_data(df):
    """Return a transformed copy of ``df``.

    Args:
        df: DataFrame with at least ``Country`` and ``GDP_USD_million`` columns.
            GDP values may be strings with thousands separators, numbers or missing.

    Returns:
        A new DataFrame where ``GDP_USD_million`` is replaced by
        ``GDP_USD_billion`` (float, rounded to 2 decimals) and a ``Region``
        column is added. Values that cannot be parsed as numbers become NaN.
        The input frame is not modified.
    """
    # Work on a copy so re-running the cell never sees a half-transformed frame.
    transformed = df.copy()

    gdp_million = pd.to_numeric(
        transformed["GDP_USD_million"].replace(",", "", regex = True),
        errors = "coerce",
    )
    transformed["GDP_USD_billion"] = gdp_million.div(1000).round(2)
    transformed["Region"] = transformed["Country"].apply(convert_name_to_continent)

    return transformed.drop("GDP_USD_million", axis = 1)


# Regions for names that the pycountry_convert lookup fails on.
_CUSTOM_COUNTRY_REGIONS = {
    "Kosovo" : "Europe",
    "DR Congo" : "Africa",
    "Zanzibar" : "Africa",
    "East Timor" : "Asia",
    "Sint Maarten" : "North America"
}


def convert_name_to_continent(country_name):
    """Map a country name to its continent name, or None when unknown.

    The name is resolved through pycountry_convert first; when that lookup
    raises, the hand-maintained ``_CUSTOM_COUNTRY_REGIONS`` table is consulted.
    """
    try:
        alpha2_code = pycountry_convert.country_name_to_country_alpha2(country_name)
        if not alpha2_code:
            return None

        continent_code = pycountry_convert.country_alpha2_to_continent_code(alpha2_code)
        return pycountry_convert.convert_continent_code_to_continent_name(continent_code)

    except Exception:
        # Any lookup failure falls back to the custom table (None if absent).
        return _CUSTOM_COUNTRY_REGIONS.get(country_name)

In [6]:
def load_data(dataframe):
    """Load step of the pipeline: pass the transformed frame to print_screen."""
    return print_screen(dataframe)

In [7]:
# Print the countries whose GDP is at least 100B USD, followed by
# the mean GDP of the top-5 countries in each Region.
def print_screen(df):
    """Print the two summary reports for a transformed GDP frame.

    Args:
        df: DataFrame with ``GDP_USD_billion`` and ``Region`` columns.
            Rows whose Region is missing are left out of the per-region report.
    """
    print("<< GDP가 100B USD이상이 되는 국가만 출력 >>")
    print()
    print(df[df['GDP_USD_billion'] >= 100])

    print()
    print("<< 각 Region별로 top5 국가의 GDP 평균>>")
    print()
    # Built-in groupby operations instead of groupby.apply with a lambda:
    # sort once, keep the five largest rows per Region, then average them.
    top5_rows = (
        df.sort_values(by = 'GDP_USD_billion', ascending = False)
        .groupby('Region')
        .head(5)
    )
    top5_mean = top5_rows.groupby('Region')['GDP_USD_billion'].mean()
    # Unnamed series keeps the printed footer as just "dtype: float64".
    top5_mean.name = None
    print(top5_mean)

In [10]:
# Run the whole pipeline (extract -> transform -> load),
# writing a log entry at the start and end of every stage.
def ETL():
    """Execute the ETL pipeline once, logging each stage.

    The log file is closed even when a stage raises; the failure is logged
    and the exception is re-raised to the caller.
    """
    logger = Logger()
    logger.start()

    try:
        logger.info("Extract started")
        extract_data()
        logger.info("Extract completed")

        # extract_data persists its result as JSON; read it back for the next stage.
        df = pd.read_json('Countries_by_GDP.json', orient = 'index')
        logger.info("Transform started")
        transformed_data = transform_data(df)
        logger.info("Transform completed")

        logger.info("Load started")
        load_data(transformed_data)
        logger.info("Load completed")
    except Exception as exc:
        logger.info(f"ETL failed: {exc!r}")
        raise
    finally:
        # Always write the separator and release the log file handle.
        logger.end()

ETL()

<< GDP가 100B USD이상이 되는 국가만 출력 >>

          Country    Year  GDP_USD_billion         Region
0           World  2025.0        115494.31           None
1   United States  2025.0         30337.16  North America
2           China  2025.0         19534.89           Asia
3         Germany  2025.0          4921.56         Europe
4           Japan  2025.0          4389.33           Asia
..            ...     ...              ...            ...
69     Uzbekistan  2024.0           112.65           Asia
70      Guatemala  2024.0           112.37  North America
71           Oman  2024.0           109.99           Asia
72       Bulgaria  2024.0           108.42         Europe
73      Venezuela  2024.0           106.33  South America

[73 rows x 4 columns]

<< 각 Region별로 top5 국가의 GDP 평균>>

Region
Africa            285.184
Asia             6327.178
Europe           3318.112
North America    6946.500
Oceania           436.658
South America     791.566
dtype: float64


In [50]:
# IMF DataMapper API.
# Fetch the NGDPD series for every entry the API returns.
def getIMFGdpByCountry(period = 2025):
    """Return the IMF ``NGDPD`` values for ``period`` as a DataFrame.

    Args:
        period: value sent as the ``periods`` query parameter (default 2025).

    Returns:
        DataFrame with columns ``Country`` (the key used by the API),
        ``Year`` (int) and ``GDP``. The frame is empty, with the same
        columns, when the request does not return HTTP 200 or the payload
        carries no values.

    NOTE(review): the endpoint may also return aggregate/region codes next to
    country codes; confirm whether those should be filtered out here.
    """
    url = "https://www.imf.org/external/datamapper/api/v1/NGDPD?periods={pr}".format(pr = period)
    # Timeout so a stalled connection cannot block the notebook indefinitely.
    response = requests.get(url, timeout = 30)
    raw_data = []

    if response.status_code == 200:
        gdp_data = response.json()
        # Tolerate a payload without "values"/"NGDPD" instead of raising KeyError.
        series_by_code = gdp_data.get("values", {}).get("NGDPD", {})
        for country, data in series_by_code.items():
            for year, gdp in data.items():
                raw_data.append({"Country": country, "Year" : int(year), "GDP" : gdp})

    df = pd.DataFrame(raw_data, columns = ["Country", "Year", "GDP"])
    return df

# Fetch the NGDPD series for each IMF region.
def getIMFGdpByRegion(period = 2025):
    """Return the IMF ``NGDPD`` values per region for ``period`` as a DataFrame.

    The region codes and labels are read from the ``/regions`` endpoint, then
    the ``NGDPD`` series is filtered down to those codes.

    Args:
        period: value sent as the ``periods`` query parameter (default 2025).

    Returns:
        DataFrame with columns ``Region`` (region label), ``Year`` (int) and
        ``GDP``. The frame is empty, with the same columns, when either request
        does not return HTTP 200 or no region matches.
    """
    columns = ["Region", "Year", "GDP"]

    region_code_url = "https://www.imf.org/external/datamapper/api/v1/regions"
    response = requests.get(region_code_url, timeout = 30)
    region_labels = dict()

    if response.status_code == 200:
        code_data = response.json()
        for code, data in code_data["regions"].items():
            region_labels[code] = data["label"]

    # Without region codes nothing can match, so skip the second request.
    if not region_labels:
        return pd.DataFrame([], columns = columns)

    raw_data = []

    gdp_url = "https://www.imf.org/external/datamapper/api/v1/NGDPD?periods={pr}".format(pr = period)
    response = requests.get(gdp_url, timeout = 30)

    if response.status_code == 200:
        gdp_data = response.json()

        if "values" in gdp_data:
            for region, data in gdp_data["values"]["NGDPD"].items():
                if region in region_labels:
                    for year, gdp in data.items():
                        raw_data.append({"Region": region_labels[region], "Year" : int(year), "GDP" : gdp})

    df = pd.DataFrame(raw_data, columns = columns)
    return df

getIMFGdpByRegion()

Unnamed: 0,Region,Year,GDP
0,Africa (Region),2025,2822.919
1,Sub-Saharan Africa (Region),2025,1941.527
2,Asia and Pacific,2025,41024.438
3,Australia and New Zealand,2025,2144.064
4,Caribbean,2025,151.571
5,Central America,2025,546.693
6,Central Asia and the Caucasus,2025,2151.706
7,East Asia,2025,27135.09
8,Eastern Europe,2025,5204.838
9,Europe,2025,28219.707
